1// Copyright (c) 2013, Mike Samuel 2// All rights reserved. 3// 4// Redistribution and use in source and binary forms, with or without 5// modification, are permitted provided that the following conditions 6// are met: 7// 8// Redistributions of source code must retain the above copyright 9// notice, this list of conditions and the following disclaimer. 10// Redistributions in binary form must reproduce the above copyright 11// notice, this list of conditions and the following disclaimer in the 12// documentation and/or other materials provided with the distribution. 13// Neither the name of the OWASP nor the names of its contributors may 14// be used to endorse or promote products derived from this software 15// without specific prior written permission. 16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27// POSSIBILITY OF SUCH DAMAGE. 28 29package org.owasp.html; 30 31import java.util.ArrayList; 32import java.util.Iterator; 33import java.util.List; 34import java.util.NoSuchElementException; 35 36import javax.annotation.Nullable; 37 38import com.google.common.collect.ImmutableMap; 39 40/** 41 * Given a string of CSS, produces a string of normalized CSS with certain 42 * useful properties detailed below. 43 * <ul> 44 * <li>All runs of white-space and comment tokens (including CDO and CDC) 45 * have been replaced with a single space character.</li> 46 * <li>All strings are quoted and escapes are escaped according to the 47 * following scheme: 48 * <table> 49 * <tr><td>NUL</td> <td><code>\0</code></tr> 50 * <tr><td>line feed</td> <td><code>\a</code></tr> 51 * <tr><td>vertical feed</td> <td><code>\c</code></tr> 52 * <tr><td>carriage return</td><td><code>\d</code></tr> 53 * <tr><td>double quote</td> <td><code>\22</code></tr> 54 * <tr><td>ampersand &</td><td><code>\26</code></tr> 55 * <tr><td>single quote</td> <td><code>\27</code></tr> 56 * <tr><td>left-angle <</td><td><code>\3c</code></tr> 57 * <tr><td>rt-angle ></td> <td><code>\3e</code></tr> 58 * <tr><td>back slash</td> <td><code>\\</code></tr> 59 * <tr><td>all others</td> <td>raw</td></tr> 60 * </table> 61 * </li> 62 * <li>All <code>url(…)</code> tokens are quoted. 63 * <li>All keywords, identifiers, and hex literals are lower-case and have 64 * embedded escape sequences decoded, except that .</li> 65 * <li>All brackets nest properly.</li> 66 * <li>Does not contain any case-insensitive variant of the sequences 67 * {@code <!--}, {@code -->}, {@code <![CDATA[}, {@code ]]>}, or 68 * {@code </style}.</li> 69 * <li>All delimiters that can start longer tokens are followed by a space. 70 * </ul> 71 */ 72final class CssTokens implements Iterable<String> { 73 74 public final String normalizedCss; 75 public final Brackets brackets; 76 private final int[] tokenBreaks; 77 private final TokenType[] tokenTypes; 78 79 public TokenIterator start() { 80 return new TokenIterator(tokenTypes.length); 81 } 82 83 public TokenIterator iterator() { return start(); } 84 85 public static CssTokens lex(String css) { 86 Lexer lexer = new Lexer(css); 87 lexer.lex(); 88 return lexer.build(); 89 } 90 91 /** A cursor into a list of tokens. */ 92 public final class TokenIterator implements Iterator<String> { 93 private int tokenIndex = 0; 94 private final int limit; 95 96 TokenIterator(int limit) { 97 this.limit = limit; 98 } 99 100 public boolean hasNext() { 101 return hasToken(); 102 } 103 104 public String next() { 105 String token = token(); 106 advance(); 107 return token; 108 } 109 110 public @Nullable TokenIterator spliceToEnd() { 111 if (!hasNext()) { throw new NoSuchElementException(); } 112 int end = brackets.partner(tokenIndex); 113 if (end < 0) { 114 return null; 115 } 116 TokenIterator between = new TokenIterator(end); 117 between.tokenIndex = tokenIndex + 1; 118 tokenIndex = end + 1; 119 return between; 120 } 121 122 public int tokenIndex() { 123 return tokenIndex; 124 } 125 126 public int startOffset() { 127 return tokenBreaks[tokenIndex]; 128 } 129 130 public int endOffset() { 131 return tokenBreaks[tokenIndex+1]; 132 } 133 134 public String token() { 135 return normalizedCss.substring(startOffset(), endOffset()); 136 } 137 138 public boolean hasToken() { 139 return tokenIndex < limit; 140 } 141 142 public boolean hasTokenAfterSpace() { 143 while (hasToken()) { 144 if (type() != TokenType.WHITESPACE) { return true; } 145 advance(); 146 } 147 return false; 148 } 149 150 /** The type of the current token. */ 151 public TokenType type() { 152 return tokenTypes[tokenIndex]; 153 } 154 155 public void seek(int tokenIndex) { 156 this.tokenIndex = tokenIndex; 157 } 158 159 public void advance() { 160 if (!hasToken()) { throw new NoSuchElementException(); } 161 ++tokenIndex; 162 } 163 164 public void backup() { 165 if (tokenIndex == 0) { throw new NoSuchElementException(); } 166 --tokenIndex; 167 } 168 169 public void remove() throws UnsupportedOperationException { 170 throw new UnsupportedOperationException(); 171 } 172 } 173 174 private CssTokens( 175 String normalizedCss, Brackets brackets, int[] tokenBreaks, 176 TokenType[] tokenTypes) { 177 this.normalizedCss = normalizedCss; 178 this.brackets = brackets; 179 this.tokenBreaks = tokenBreaks; 180 this.tokenTypes = tokenTypes; 181 } 182 183 public enum TokenType { 184 /** An identifier. */ 185 IDENT, 186 /** An identifier prefixed with a period. */ 187 DOT_IDENT, 188 /** A function name and opening bracket. */ 189 FUNCTION, 190 /** An {@code @<identifier>} directive token. */ 191 AT, 192 /** A hash token that contains non-hex characters. */ 193 HASH_ID, 194 /** A hash token that could be a color literal. */ 195 HASH_UNRESTRICTED, 196 /** A quoted string. */ 197 STRING, 198 /** A URL of the form <code>url("...")</code>. */ 199 URL, 200 /** A single character. */ 201 DELIM, 202 /** A scalar numeric value. */ 203 NUMBER, 204 /** A percentage. */ 205 PERCENTAGE, 206 /** A numeric value with a unit suffix. */ 207 DIMENSION, 208 /** A numeric value with an unknown unit suffix. */ 209 BAD_DIMENSION, 210 /** {@code U+<hex-or-qmark>} */ 211 UNICODE_RANGE, 212 /** 213 * include-match, dash-match, prefix-match, suffix-match, substring-match 214 */ 215 MATCH, 216 /** {@code ||} */ 217 COLUMN, 218 /** A run of white-space, comment, CDO, and CDC tokens. */ 219 WHITESPACE, 220 /** {@code :} */ 221 COLON, 222 /** {@code ;} */ 223 SEMICOLON, 224 /** {@code ,} */ 225 COMMA, 226 /** {@code [} */ 227 LEFT_SQUARE, 228 /** {@code ]} */ 229 RIGHT_SQUARE, 230 /** {@code (} */ 231 LEFT_PAREN, 232 /** {@code )} */ 233 RIGHT_PAREN, 234 /** <code>{</code> */ 235 LEFT_CURLY, 236 /** <code>}</code> */ 237 RIGHT_CURLY, 238 ; 239 } 240 241 /** 242 * Maps tokens to their partners. A close bracket token like {@code (} may 243 * have a partner token like {@code )} if properly nested, and vice-versa. 244 */ 245 static final class Brackets { 246 /** 247 * For each token index, the index of the indexed token's partner or -1 if 248 * it has none. 249 */ 250 private final int[] brackets; 251 252 private Brackets(int[] brackets) { 253 this.brackets = brackets; 254 } 255 256 /** The index of the partner token or -1 if none. */ 257 int partner(int tokenIndex) { 258 int bracketIndex = bracketIndexForToken(tokenIndex); 259 if (bracketIndex < 0) { return -1; } 260 return brackets[(bracketIndex << 1) + 1]; 261 } 262 263 int bracketIndexForToken(int target) { 264 // Binary search by leftmost element of pair. 265 int left = 0; 266 int right = brackets.length >> 1; 267 while (left < right) { 268 int mid = left + ((right - left) >> 1); 269 int value = brackets[mid << 1]; 270 if (value == target) { return mid; } 271 if (value < target) { 272 left = mid + 1; 273 } else { 274 right = mid; 275 } 276 } 277 return -1; 278 } 279 } 280 281 private static final int[] ZERO_INTS = new int[0]; 282 283 private static final TokenType[] ZERO_TYPES = new TokenType[0]; 284 285 private static final Brackets EMPTY_BRACKETS = new Brackets(ZERO_INTS); 286 287 private static final CssTokens EMPTY = new CssTokens( 288 "", EMPTY_BRACKETS, ZERO_INTS, ZERO_TYPES); 289 290 /** 291 * Tokenizes according to section 4 of http://dev.w3.org/csswg/css-syntax/ 292 */ 293 private static final class Lexer { 294 private final String css; 295 private final StringBuilder sb; 296 private int pos = 0; 297 private final int cssLimit; 298 299 private List<TokenType> tokenTypes = null; 300 private int[] tokenBreaks = new int[128]; 301 private int tokenBreaksLimit = 0; 302 303 /** 304 * For each bracket, 2 ints: the token index of the bracket, and the token 305 * index of its partner. 306 * The array is sorted by the first int. 307 * The second int is -1 when the bracket has not yet been closed. 308 */ 309 private int[] brackets = ZERO_INTS; 310 /** 311 * The number of elements in {@link #brackets} that are valid. 312 * {@code brackets[bracketsLimit:]} is zeroed space that the list can grow 313 * into. 314 */ 315 private int bracketsLimit = 0; 316 /** 317 * For each bracket that has not been closed, 2 ints: 318 * its index in {@link #brackets} and the character of its close bracket 319 * as an int. 320 * This is a bracket stack so the array is sorted by the first int. 321 */ 322 private int[] open = ZERO_INTS; 323 /** 324 * The number of elements in {@link #open} that are valid. 325 * {@code open[openLimit:]} is garbage space that the stack can grow into. 326 */ 327 private int openLimit = 0; 328 329 Lexer(String css) { 330 this.css = css; 331 this.sb = new StringBuilder(); 332 this.cssLimit = css.length(); 333 } 334 335 TokenType openBracket(char bracketChar) { 336 char close; 337 TokenType type; 338 switch (bracketChar) { 339 case '(': close = ')'; type = TokenType.LEFT_PAREN; break; 340 case '[': close = ']'; type = TokenType.LEFT_SQUARE; break; 341 case '{': close = '}'; type = TokenType.LEFT_CURLY; break; 342 default: 343 throw new AssertionError("Invalid open bracket " + bracketChar); 344 } 345 brackets = expandIfNecessary(brackets, bracketsLimit, 2); 346 open = expandIfNecessary(open, openLimit, 2); 347 open[openLimit++] = bracketsLimit; 348 open[openLimit++] = close; 349 brackets[bracketsLimit++] = tokenBreaksLimit; 350 brackets[bracketsLimit++] = -1; 351 sb.append(bracketChar); 352 return type; 353 } 354 355 void closeBracket(char bracketChar) { 356 int openLimitAfterClose = openLimit; 357 do { 358 if (openLimitAfterClose == 0) { 359 // Drop an orphaned close bracket. 360 breakOutput(); 361 return; 362 } 363 openLimitAfterClose -= 2; 364 } while (bracketChar != open[openLimitAfterClose + 1]); 365 closeBrackets(openLimitAfterClose); 366 } 367 368 private void closeBrackets(int openLimitAfterClose) { 369 // Make sure we've got space on brackets. 370 int spaceNeeded = openLimit - openLimitAfterClose; 371 brackets = expandIfNecessary(brackets, bracketsLimit, spaceNeeded); 372 373 int closeTokenIndex = tokenBreaksLimit; 374 while (openLimit > openLimitAfterClose) { 375 // Pop the stack. 376 int closeBracket = open[--openLimit]; 377 int openBracketIndex = open[--openLimit]; 378 int openTokenIndex = brackets[openBracketIndex]; 379 // Update open bracket to point to its partner. 380 brackets[openBracketIndex + 1] = closeTokenIndex; 381 // Emit the close bracket. 382 brackets[bracketsLimit++] = closeTokenIndex; 383 brackets[bracketsLimit++] = openTokenIndex; 384 sb.appendCodePoint(closeBracket); 385 closeTokenIndex++; 386 } 387 } 388 389 CssTokens build() { 390 // Close any still open brackets. 391 { 392 int startOfCloseBrackets = sb.length(); 393 closeBrackets(0); 394 emitMergedTokens(startOfCloseBrackets, sb.length()); 395 } 396 397 if (tokenTypes == null) { return EMPTY; } 398 int[] bracketsTrunc = truncateOrShare(brackets, bracketsLimit); 399 400 // Strip any trailing space off, since it may have been inserted by a 401 // breakAfter call anyway. 402 int cssEnd = sb.length(); 403 if (cssEnd > 0 && sb.charAt(cssEnd - 1) == ' ') { 404 --cssEnd; 405 tokenTypes.remove(--tokenBreaksLimit); 406 } 407 String normalizedCss = sb.substring(0, cssEnd); 408 409 // Store the last character on the tokenBreaksList to simplify finding the 410 // end of a token. 411 tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1); 412 tokenBreaks[tokenBreaksLimit++] = normalizedCss.length(); 413 414 int[] tokenBreaksTrunc = truncateOrShare(tokenBreaks, tokenBreaksLimit); 415 TokenType[] tokenTypesArr = tokenTypes.toArray(ZERO_TYPES); 416 417 return new CssTokens( 418 normalizedCss, new Brackets(bracketsTrunc), 419 tokenBreaksTrunc, tokenTypesArr); 420 } 421 422 void lex() { 423 // Fast-track no content. 424 consumeIgnorable(); 425 sb.setLength(0); 426 if (pos == cssLimit) { return; } 427 428 tokenTypes = new ArrayList<TokenType>(); 429 430 String css = this.css; 431 int cssLimit = this.cssLimit; 432 while (pos < cssLimit) { 433 assert this.tokenBreaksLimit == this.tokenTypes.size() 434 : "token and types out of sync at " + tokenBreaksLimit 435 + " in `" + css + "`"; 436 // SPEC: 4. Tokenization 437 // The output of the tokenization step is a stream of zero 438 // or more of the following tokens: <ident>, <function>, 439 // <at-keyword>, <hash>, <string>, <bad-string>, <url>, 440 // <bad-url>, <delim>, <number>, <percentage>, 441 // <dimension>, <unicode-range>, <include-match>, 442 // <dash-match>, <prefix-match>, <suffix-match>, 443 // <substring-match>, <column>, <whitespace>, <CDO>, 444 // <CDC>, <colon>, <semicolon>, <comma>, <[>, <]>, 445 // <(>, <)>, <{>, and <}>. 446 447 // IMPLEMENTS: 4.3 Consume a token 448 char ch = css.charAt(pos); 449 int startOfToken = pos; 450 int startOfOutputToken = sb.length(); 451 final TokenType type; 452 switch (ch) { 453 case '\t': case '\n': case '\f': case '\r': case ' ': case '\ufeff': 454 consumeIgnorable(); 455 type = TokenType.WHITESPACE; 456 break; 457 case '/': { 458 char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0; 459 if (lookahead == '/' || lookahead == '*') { 460 consumeIgnorable(); 461 type = TokenType.WHITESPACE; 462 } else { 463 consumeDelim(ch); 464 type = TokenType.DELIM; 465 } 466 break; 467 } 468 case '<': 469 if (consumeIgnorable()) { // <!-- 470 type = TokenType.WHITESPACE; 471 } else { 472 consumeDelim('<'); 473 type = TokenType.DELIM; 474 } 475 break; 476 case '>': 477 breakOutput(); 478 sb.append('>'); 479 type = TokenType.DELIM; 480 ++pos; 481 break; 482 case '@': 483 if (consumeAtKeyword()) { 484 type = TokenType.AT; 485 } else { 486 consumeDelim(ch); 487 type = TokenType.DELIM; 488 } 489 break; 490 case '#': { 491 sb.append('#'); 492 TokenType hashType = consumeHash(); 493 if (hashType != null) { 494 type = hashType; 495 } else { 496 ++pos; 497 sb.append(' '); 498 type = TokenType.DELIM; 499 } 500 break; 501 } 502 case '"': 503 case '\'': 504 type = consumeString(); 505 break; 506 case 'U': case 'u': 507 // SPEC handle URL under "ident like token". 508 if (consumeUnicodeRange()) { 509 type = TokenType.UNICODE_RANGE; 510 } else { 511 type = consumeIdentOrUrlOrFunction(); 512 } 513 break; 514 case '0': case '1': case '2': case '3': case '4': 515 case '5': case '6': case '7': case '8': case '9': 516 type = consumeNumberOrPercentageOrDimension(); 517 break; 518 case '+': case '-': case '.': { 519 char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0; 520 if (isDecimal(lookahead) 521 || (lookahead == '.' && pos + 2 < cssLimit 522 && isDecimal(css.charAt(pos + 2)))) { 523 type = consumeNumberOrPercentageOrDimension(); 524 } else if (ch == '+') { 525 consumeDelim(ch); 526 type = TokenType.DELIM; 527 } else if (ch == '-') { 528 if (consumeIgnorable()) { // --> 529 type = TokenType.WHITESPACE; 530 } else { 531 type = consumeIdentOrUrlOrFunction(); 532 } 533 } else if (isIdentPart(lookahead)) { 534 // treat ".<IDENT>" as one token. 535 sb.append('.'); 536 ++pos; 537 consumeIdent(false); 538 if (pos != startOfToken + 1) { 539 type = TokenType.DOT_IDENT; 540 if (pos < cssLimit) { 541 char next = css.charAt(pos); 542 if ('(' == next) { 543 // A dotted identifier followed by a parenthesis is 544 // ambiguously a function. 545 sb.append(' '); 546 } 547 } 548 } else { 549 type = TokenType.DELIM; 550 sb.append(' '); 551 } 552 } else { 553 consumeDelim('.'); 554 type = TokenType.DELIM; 555 } 556 break; 557 } 558 case ':': consumeDelim(ch); type = TokenType.COLON; break; 559 case ';': consumeDelim(ch); type = TokenType.SEMICOLON; break; 560 case ',': consumeDelim(ch); type = TokenType.COMMA; break; 561 case '[': case '(': case '{': 562 type = openBracket(ch); 563 ++pos; 564 break; 565 case '}': case ')': case ']': 566 closeBracket(ch); 567 ++pos; 568 // Use DELIM so that a later loop will split output into multiple 569 // tokens since we may have inserted missing close brackets for 570 // unclosed open brackets already on the stack. 571 type = TokenType.DELIM; 572 break; 573 case '~': case '|': case '^': case '$': case '*': { 574 char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0; 575 if (lookahead == '=') { 576 consumeMatch(ch); 577 type = TokenType.MATCH; 578 } else if (ch == '|' && lookahead == '|') { 579 consumeColumn(); 580 type = TokenType.COLUMN; 581 } else { 582 consumeDelim(ch); 583 type = TokenType.DELIM; 584 } 585 break; 586 } 587 case '_': 588 type = consumeIdentOrUrlOrFunction(); 589 break; 590 case '\\': { 591 // Optimistically parse as an ident. 592 TokenType identType = consumeIdentOrUrlOrFunction(); 593 if (identType == null) { 594 ++pos; // drop 595 breakOutput(); 596 type = TokenType.WHITESPACE; 597 } else { 598 type = identType; 599 } 600 // TODO: handle case where "url" is encoded. 601 break; 602 } 603 default: 604 int chlower = ch | 32; 605 if ('a' <= chlower && chlower <= 'z' || ch >= 0x80) { 606 TokenType identType = consumeIdentOrUrlOrFunction(); 607 if (identType != null) { 608 type = identType; 609 } else { // Occurs on undefined-codepoints. 610 ++pos; 611 breakOutput(); 612 type = TokenType.WHITESPACE; 613 } 614 } else if (ch > 0x20) { 615 consumeDelim(ch); 616 type = TokenType.DELIM; 617 } else { // Ignore. 618 consumeIgnorable(); 619 type = TokenType.WHITESPACE; 620 } 621 } 622 assert pos > startOfToken 623 : "empty token at " + pos + ", ch0=" + css.charAt(startOfToken) 624 + ":U+" + Integer.toHexString(css.charAt(startOfToken)); 625 int endOfOutputToken = sb.length(); 626 if (endOfOutputToken > startOfOutputToken) { 627 if (type == TokenType.DELIM) { 628 emitMergedTokens(startOfOutputToken, endOfOutputToken); 629 } else { 630 if (type != TokenType.WHITESPACE 631 && sb.charAt(startOfOutputToken) == ' ') { 632 emitToken(TokenType.WHITESPACE, startOfOutputToken); 633 ++startOfOutputToken; 634 assert startOfOutputToken != endOfOutputToken; 635 } 636 emitToken(type, startOfOutputToken); 637 // Token emitters can emit a space after a token to avoid possible 638 // merges with following tokens 639 if (type != TokenType.WHITESPACE) { 640 int sbLen = sb.length(); 641 if (startOfOutputToken + 1 < sbLen 642 && sb.charAt(sbLen - 1) == ' ') { 643 emitToken(TokenType.WHITESPACE, sbLen - 1); 644 } 645 } 646 } 647 } 648 } 649 } 650 651 private void emitMergedTokens(int start, int end) { 652 // Handle breakOutput and merging of output tokens. 653 for (int e = start; e < end; ++e) { 654 TokenType delimType; 655 switch (sb.charAt(e)) { 656 case ' ': delimType = TokenType.WHITESPACE; break; 657 case '}': delimType = TokenType.RIGHT_CURLY; break; 658 case ')': delimType = TokenType.RIGHT_PAREN; break; 659 case ']': delimType = TokenType.RIGHT_SQUARE; break; 660 default : delimType = TokenType.DELIM; break; 661 } 662 emitToken(delimType, e); 663 } 664 } 665 666 private void emitToken(TokenType type, int startOfOutputToken) { 667 if (tokenBreaksLimit == 0 668 || tokenBreaks[tokenBreaksLimit - 1] != startOfOutputToken) { 669 tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1); 670 tokenBreaks[tokenBreaksLimit++] = startOfOutputToken; 671 tokenTypes.add(type); 672 } 673 } 674 675 private void consumeDelim(char ch) { 676 sb.append(ch); 677 switch (ch) { 678 // Prevent token merging. 679 case '~': case '|': case '^': case '$': case '\\': 680 case '.': case '+': case '-': case '@': case '/': case '<': 681 sb.append(' '); 682 break; 683 default: 684 break; 685 } 686 ++pos; 687 } 688 689 private boolean consumeIgnorable() { 690 String css = this.css; 691 int cssLimit = this.cssLimit; 692 int posBefore = pos; 693 while (pos < cssLimit) { 694 char ch = css.charAt(pos); 695 if (ch <= 0x20 696 // Treat a BOM as white-space so that it is ignored at the beginning 697 // of a file. 698 || ch == '\ufeff') { 699 ++pos; 700 } else if (pos + 1 == cssLimit) { 701 break; 702 } else if (ch == '/') { 703 char next = css.charAt(pos + 1); 704 if (next == '*') { 705 pos += 2; 706 while (pos < cssLimit) { 707 int ast = css.indexOf('*', pos); 708 if (ast < 0) { 709 pos = cssLimit; // Unclosed /* comment */ 710 break; 711 } else { 712 // Advance over a run of '*'s. 713 pos = ast + 1; 714 while (pos < cssLimit && css.charAt(pos) == '*') { 715 ++pos; 716 } 717 if (pos < cssLimit && css.charAt(pos) == '/') { 718 ++pos; 719 break; 720 } 721 } 722 } 723 } else if (next == '/') { // Non-standard but widely supported 724 while (++pos < cssLimit) { 725 if (isLineTerminator(css.charAt(pos))) { break; } 726 } 727 } else { 728 break; 729 } 730 } else if (ch == '<') { 731 if (pos + 3 < cssLimit 732 && '!' == css.charAt(pos + 1) 733 && '-' == css.charAt(pos + 2) 734 && '-' == css.charAt(pos + 3)) { 735 pos += 4; 736 } else { 737 break; 738 } 739 } else if (ch == '-') { 740 if (pos + 2 < cssLimit 741 && '-' == css.charAt(pos + 1) 742 && '>' == css.charAt(pos + 2)) { 743 pos += 3; 744 } else { 745 break; 746 } 747 } else { 748 break; 749 } 750 } 751 if (pos == posBefore) { 752 return false; 753 } else { 754 breakOutput(); 755 return true; 756 } 757 } 758 759 private void breakOutput() { 760 int last = sb.length() - 1; 761 if (last >= 0 && sb.charAt(last) != ' ') { sb.append(' '); } 762 } 763 764 private void consumeColumn() { 765 pos += 2; 766 sb.append("||"); 767 } 768 769 private void consumeMatch(char ch) { 770 pos += 2; 771 sb.append(ch).append('='); 772 } 773 774 private void consumeIdent(boolean allowFirstDigit) { 775 int cssLimit = this.cssLimit; 776 int last = -1, nCodepoints = 0; 777 int sbAtStart = sb.length(); 778 int posAtStart = pos; 779 while (pos < cssLimit) { 780 int posBefore = pos; 781 782 int decoded = readCodepoint(); 783 if (decoded == '\\') { 784 decoded = consumeAndDecodeEscapeSequence(); 785 } else { 786 ++pos; 787 } 788 789 if (decoded >= 0 && isIdentPart(decoded)) { 790 if (!allowFirstDigit && nCodepoints < 2 791 && '0' <= decoded && decoded <= '9') { 792 // Don't allow encoded identifiers that look like numeric tokens 793 // like \-1 or ones that start with an encoded decimal digit. 794 if (last == '-' || last == -1) { 795 pos = posAtStart; 796 sb.setLength(sbAtStart); 797 return; 798 } 799 } 800 sb.appendCodePoint(decoded); 801 last = decoded; 802 ++nCodepoints; 803 } else { 804 pos = posBefore; 805 return; 806 } 807 } 808 } 809 810 private boolean consumeAtKeyword() { 811 assert css.charAt(pos) == '@'; 812 int bufferLengthBeforeWrite = sb.length(); 813 sb.append('@'); 814 int posBeforeKeyword = ++pos; 815 consumeIdent(false); 816 if (pos == posBeforeKeyword) { 817 --pos; // back up over '@' 818 sb.setLength(bufferLengthBeforeWrite); // Unwrite the '@' 819 return false; 820 } else { 821 return true; 822 } 823 } 824 825 826 private int consumeAndDecodeEscapeSequence() { 827 String css = this.css; 828 int cssLimit = this.cssLimit; 829 assert css.charAt(pos) == '\\'; 830 if (pos + 1 >= cssLimit) { return -1; } 831 char esc = css.charAt(pos + 1); 832 if (isLineTerminator(esc)) { return -1; } 833 int escLower = esc | 32; 834 if (('0' <= esc && esc <= '9') 835 || ('a' <= escLower && escLower <= 'f')) { 836 int hexValue = 0; 837 int hexStart = pos + 1; 838 int hexLimit = Math.min(pos + 7, cssLimit); 839 int hexEnd = hexStart; 840 do { 841 hexValue = (hexValue << 4) 842 | (esc <= '9' ? esc - '0' : escLower - ('a' - 10)); 843 ++hexEnd; 844 if (hexEnd == hexLimit) { break; } 845 esc = css.charAt(hexEnd); 846 escLower = esc | 32; 847 } while (('0' <= esc && esc <= '9') 848 || ('a' <= escLower && escLower <= 'f')); 849 if (!Character.isDefined(hexValue)) { 850 hexValue = 0xfffd; 851 } 852 pos = hexEnd; 853 if (pos < cssLimit) { 854 // A sequence of hex digits can be followed by a space that allows 855 // so that code-point U+A followed by the letter 'b' can be rendered 856 // as "\a b" since "\ab" specifies the single code-point U+AB. 857 char next = css.charAt(pos); 858 if (next == ' ' || next == '\t' || isLineTerminator(next)) { 859 ++pos; 860 } 861 } 862 return hexValue; 863 } 864 pos += 2; 865 return esc; 866 } 867 868 private static final long HEX_ENCODED_BITMASK = 869 (1L << 0) | LINE_TERMINATOR_BITMASK 870 | (1L << '"') | (1L << '\'') | (1L << '&') | (1L << '<') | (1L << '>'); 871 private static boolean isHexEncoded(int codepoint) { 872 return (0 <= codepoint && codepoint < 63 873 && 0 != ((1L << codepoint) & HEX_ENCODED_BITMASK)); 874 } 875 876 private void encodeCharOntoOutput(int codepoint, int last) { 877 switch (codepoint) { 878 case '\\': sb.append("\\\\"); break; 879 case '\0': sb.append("\\0"); break; 880 case '\n': sb.append("\\a"); break; 881 case '\f': sb.append("\\c"); break; 882 case '\r': sb.append("\\d"); break; 883 case '\"': sb.append("\\22"); break; 884 case '&': sb.append("\\26"); break; 885 case '\'': sb.append("\\27"); break; 886 case '<': sb.append("\\3c"); break; 887 case '>': sb.append("\\3e"); break; 888 // The set of escapes above that end with a hex digit must appear in 889 // HEX_ENCODED_BITMASK. 890 case '-': 891 sb.append('-'); 892 break; 893 default: 894 if (isHexEncoded(last) 895 // We need to put a space after a trailing hex digit if the 896 // next encoded character on the output would be another hex 897 // digit or a space character. The other space characters 898 // are handled above. 899 && (codepoint == ' ' || codepoint == '\t' 900 || ('0' <= codepoint && codepoint <= '9') 901 || ('a' <= (codepoint | 32) && (codepoint | 32) <= 'f'))) { 902 sb.append(' '); 903 } 904 sb.appendCodePoint(codepoint); 905 break; 906 } 907 } 908 909 private TokenType consumeNumberOrPercentageOrDimension() { 910 String css = this.css; 911 int cssLimit = this.cssLimit; 912 boolean isZero = true; 913 int intStart = pos; 914 if (intStart < cssLimit) { 915 char ch = css.charAt(intStart); 916 if (ch == '-' || ch == '+') { 917 ++intStart; 918 } 919 } 920 // Find the integer part after any sign. 921 int intEnd = intStart; 922 for (; intEnd < cssLimit; ++intEnd) { 923 char ch = css.charAt(intEnd); 924 if (!('0' <= ch && ch <= '9')) { break; } 925 if (ch != '0') { isZero = false; } 926 } 927 // Find a fraction like ".5" or ".". 928 int fractionStart = intEnd; 929 int fractionEnd = fractionStart; 930 if (fractionEnd < cssLimit && '.' == css.charAt(fractionEnd)) { 931 ++fractionEnd; 932 for (; fractionEnd < cssLimit; ++fractionEnd) { 933 char ch = css.charAt(fractionEnd); 934 if (!('0' <= ch && ch <= '9')) { break; } 935 if (ch != '0') { isZero = false; } 936 } 937 } 938 int exponentStart = fractionEnd; 939 int exponentIntStart = exponentStart; 940 int exponentEnd = exponentStart; 941 boolean isExponentZero = true; 942 if (exponentStart < cssLimit && 'e' == (css.charAt(exponentStart) | 32)) { 943 // 'e' and 'e' in "5e-f" for a 944 exponentEnd = exponentStart + 1; 945 if (exponentEnd < cssLimit) { 946 char ch = css.charAt(exponentEnd); 947 if (ch == '+' || ch == '-') { ++exponentEnd; } 948 } 949 exponentIntStart = exponentEnd; 950 for (; exponentEnd < cssLimit; ++exponentEnd) { 951 char ch = css.charAt(exponentEnd); 952 if (!('0' <= ch && ch <= '9')) { break; } 953 if (ch != '0') { isExponentZero = false; } 954 } 955 // Since 956 // dimension := <number> <ident> 957 // the below are technically valid dimensions even though they appear 958 // to have incomplete exponents: 959 // 5e 960 // 5ex 961 // 5e- 962 if (exponentEnd == exponentIntStart) { // Incomplete exponent. 963 exponentIntStart = exponentEnd = exponentStart; 964 isExponentZero = true; 965 } 966 } 967 968 int unitStart = exponentEnd; 969 // Skip over space between number and unit. 970 // Many user-agents allow "5 ex" instead of "5ex". 971 while (unitStart < cssLimit) { 972 char ch = css.charAt(unitStart); 973 if (ch == ' ' || isLineTerminator(ch)) { 974 ++unitStart; 975 } else { 976 break; 977 } 978 } 979 980 if (sb.length() != 0 && isIdentPart(sb.charAt(sb.length() - 1))) { 981 sb.append(' '); 982 } 983 // Normalize the number onto the buffer. 984 // We will normalize and unit later. 985 // Skip the sign if it is positive. 986 if (intStart != pos && '-' == css.charAt(pos) && !isZero) { 987 sb.append('-'); 988 } 989 if (isZero) { 990 sb.append('0'); 991 } else { 992 // Strip leading zeroes from the integer and exponent and trailing 993 // zeroes from the fraction. 994 while (intStart < intEnd && css.charAt(intStart) == '0') { ++intStart; } 995 while (fractionEnd > fractionStart 996 && css.charAt(fractionEnd - 1) == '0') { 997 --fractionEnd; 998 } 999 if (intStart == intEnd) { 1000 sb.append('0'); // .5 -> 0.5 1001 } else { 1002 sb.append(css, intStart, intEnd); 1003 } 1004 if (fractionEnd > fractionStart + 1) { // 5. -> 5; 5.0 -> 5 1005 sb.append(css, fractionStart, fractionEnd); 1006 } 1007 if (!isExponentZero) { 1008 sb.append('e'); 1009 // 1e+1 -> 1e1 1010 if ('-' == css.charAt(exponentIntStart - 1)) { sb.append('-'); } 1011 while (exponentIntStart < exponentEnd 1012 && css.charAt(exponentIntStart) == '0') { 1013 ++exponentIntStart; 1014 } 1015 sb.append(css, exponentIntStart, exponentEnd); 1016 } 1017 } 1018 1019 int unitEnd; 1020 TokenType type; 1021 if (unitStart < cssLimit && '%' == css.charAt(unitStart)) { 1022 unitEnd = unitStart + 1; 1023 type = TokenType.PERCENTAGE; 1024 sb.append('%'); 1025 } else { 1026 // The grammar says that any identifier following a number is a unit. 1027 int bufferBeforeUnit = sb.length(); 1028 pos = unitStart; 1029 consumeIdent(false); 1030 int bufferAfterUnit = sb.length(); 1031 boolean knownUnit = isWellKnownUnit( 1032 sb, bufferBeforeUnit, bufferAfterUnit); 1033 if (unitStart == exponentEnd // No intervening space 1034 || knownUnit) { 1035 unitEnd = pos; 1036 // 3IN -> 3in 1037 for (int i = bufferBeforeUnit; i < bufferAfterUnit; ++i) { 1038 char ch = sb.charAt(i); 1039 if ('A' <= ch && ch <= 'Z') { sb.setCharAt(i, (char) (ch | 32)); } 1040 } 1041 } else { 1042 unitEnd = unitStart = exponentEnd; 1043 sb.setLength(bufferBeforeUnit); 1044 } 1045 type = unitStart == unitEnd 1046 ? TokenType.NUMBER 1047 : knownUnit 1048 ? TokenType.DIMENSION 1049 : TokenType.BAD_DIMENSION; 1050 } 1051 pos = unitEnd; 1052 if (type != TokenType.PERCENTAGE 1053 && pos < cssLimit && css.charAt(pos) == '.') { 1054 sb.append(' '); 1055 } 1056 return type; 1057 } 1058 1059 private TokenType consumeString() { 1060 String css = this.css; 1061 int cssLimit = this.cssLimit; 1062 1063 char delim = css.charAt(pos); 1064 assert delim == '"' || delim == '\''; 1065 ++pos; 1066 int startOfStringOnOutput = sb.length(); 1067 sb.append('\''); 1068 int last = -1; 1069 boolean closed = false; 1070 while (pos < cssLimit) { 1071 char ch = css.charAt(pos); 1072 if (ch == delim) { 1073 ++pos; 1074 closed = true; 1075 break; 1076 } 1077 if (isLineTerminator(ch)) { break; } 1078 int decoded = ch; 1079 if (ch == '\\') { 1080 if (pos + 1 < cssLimit && isLineTerminator(css.charAt(pos+1))) { 1081 // consume it but generate no tokens. 1082 // Lookahead to treat a \r\n sequence as one line-terminator. 1083 if (pos + 2 < cssLimit 1084 && css.charAt(pos+1) == '\r' && css.charAt(pos+2) == '\n') { 1085 pos += 3; 1086 } else { 1087 pos += 2; 1088 } 1089 continue; 1090 } else { 1091 decoded = consumeAndDecodeEscapeSequence(); 1092 if (decoded < 0) { 1093 break; 1094 } 1095 } 1096 } else { 1097 ++pos; 1098 } 1099 encodeCharOntoOutput(decoded, last); 1100 last = decoded; 1101 } 1102 if (closed) { 1103 sb.append('\''); 1104 return TokenType.STRING; 1105 } else { // Drop <bad-string>s 1106 sb.setLength(startOfStringOnOutput); 1107 breakOutput(); 1108 return TokenType.WHITESPACE; 1109 } 1110 } 1111 1112 private @Nullable TokenType consumeHash() { 1113 assert css.charAt(pos) == '#'; 1114 ++pos; 1115 int beforeIdent = pos; 1116 consumeIdent(true); 1117 if (pos == beforeIdent) { 1118 pos = beforeIdent - 1; 1119 return null; 1120 } 1121 for (int i = beforeIdent; i < pos; ++i) { 1122 char chLower = (char) (css.charAt(i) | 32); 1123 if (!(('0' <= chLower && chLower <= '9') 1124 || ('a' <= chLower && chLower <= 'f'))) { 1125 return TokenType.HASH_ID; 1126 } 1127 } 1128 return TokenType.HASH_UNRESTRICTED; 1129 } 1130 1131 private boolean consumeUnicodeRange() { 1132 final String css = this.css; 1133 final int cssLimit = this.cssLimit; 1134 1135 assert pos < cssLimit && (css.charAt(pos) | 32) == 'u'; 1136 1137 final int start = pos; 1138 final int startOfOutput = sb.length(); 1139 ++pos; 1140 boolean ok = false; 1141 parse: 1142 try { 1143 if (pos == cssLimit || css.charAt(pos) != '+') { 1144 break parse; 1145 } 1146 ++pos; 1147 sb.append("U+"); 1148 int numStartDigits = 0; 1149 while (pos < cssLimit && numStartDigits < 6) { 1150 char chLower = (char) (css.charAt(pos) | 32); 1151 if (('0' <= chLower && chLower <= '9') 1152 || ('a' <= chLower && chLower <= 'f')) { 1153 sb.append(chLower); 1154 ++numStartDigits; 1155 ++pos; 1156 } else { 1157 break; 1158 } 1159 } 1160 if (numStartDigits == 0) { 1161 break parse; 1162 } 1163 boolean hasQmark = false; 1164 while (pos < cssLimit && numStartDigits < 6 && css.charAt(pos) == '?') { 1165 hasQmark = true; 1166 sb.append('?'); 1167 ++numStartDigits; 1168 ++pos; 1169 } 1170 if (numStartDigits == 0) { 1171 break parse; 1172 } 1173 if (pos < cssLimit && css.charAt(pos) == '-') { 1174 if (!hasQmark) { 1175 // Look for end of range. 1176 ++pos; 1177 sb.append('-'); 1178 int numEndDigits = 0; 1179 while (pos < cssLimit && numEndDigits < 6) { 1180 char chLower = (char) (css.charAt(pos) | 32); 1181 if (('0' <= chLower && chLower <= '9') 1182 || ('a' <= chLower && chLower <= 'f')) { 1183 ++numEndDigits; 1184 ++pos; 1185 sb.append(chLower); 1186 } else { 1187 break; 1188 } 1189 } 1190 if (numEndDigits == 0) { 1191 // Back up over '-' 1192 --pos; 1193 sb.append(' '); 1194 } 1195 } else { 1196 sb.append(' '); 1197 } 1198 } 1199 ok = true; 1200 } finally { 1201 if (!ok) { 1202 pos = start; 1203 sb.setLength(startOfOutput); 1204 } 1205 } 1206 return ok; 1207 } 1208 1209 private @Nullable TokenType consumeIdentOrUrlOrFunction() { 1210 int bufferStart = sb.length(); 1211 int posBefore = pos; 1212 consumeIdent(false); 1213 if (pos == posBefore) { return null; } 1214 boolean parenAfter = pos < cssLimit && css.charAt(pos) == '('; 1215 if (sb.length() - bufferStart == 3 1216 && 'u' == (sb.charAt(bufferStart) | 32) 1217 && 'r' == (sb.charAt(bufferStart + 1) | 32) 1218 && 'l' == (sb.charAt(bufferStart + 2) | 32)) { 1219 if (parenAfter && consumeUrlValue()) { 1220 sb.setCharAt(bufferStart, 'u'); 1221 sb.setCharAt(bufferStart + 1, 'r'); 1222 sb.setCharAt(bufferStart + 2, 'l'); 1223 return TokenType.URL; 1224 } else { 1225 sb.setLength(bufferStart); 1226 breakOutput(); 1227 return TokenType.WHITESPACE; 1228 } 1229 } else if (parenAfter) { 1230 openBracket('('); 1231 ++pos; 1232 return TokenType.FUNCTION; 1233 } else { 1234 if (pos + 1 < cssLimit && '.' == css.charAt(pos)) { 1235 // Prevent merging of ident and number as in 1236 // border:solid.1cm black 1237 // when .1 is rewritten to 0.1 becoming 1238 // border:solid0.1cm black 1239 char next = css.charAt(pos + 1); 1240 if ('0' <= next && next <= '9') { 1241 sb.append(' '); 1242 } 1243 } 1244 return TokenType.IDENT; 1245 } 1246 } 1247 1248 private boolean consumeUrlValue() { 1249 String css = this.css; 1250 int cssLimit = this.cssLimit; 1251 if (pos == cssLimit || css.charAt(pos) != '(') { return false; } 1252 ++pos; 1253 // skip space. 1254 for (; pos < cssLimit; ++pos) { 1255 char ch = css.charAt(pos); 1256 if (ch != ' ' && !isLineTerminator(ch)) { break; } 1257 } 1258 // Find the value. 1259 int delim; 1260 if (pos < cssLimit) { 1261 char ch = pos < cssLimit ? css.charAt(pos) : '\0'; 1262 if (ch == '"' || ch == '\'') { 1263 delim = ch; 1264 ++pos; 1265 } else { 1266 delim = '\0'; 1267 } 1268 } else { 1269 return false; 1270 } 1271 sb.append("('"); 1272 while (pos < cssLimit) { 1273 int decoded = readCodepoint(); 1274 if (delim != 0) { 1275 if (decoded == delim) { 1276 ++pos; 1277 break; 1278 } 1279 } else if (decoded <= ' ' || decoded == ')') { 1280 break; 1281 } 1282 if (decoded == '\\') { 1283 decoded = consumeAndDecodeEscapeSequence(); 1284 if (decoded < 0) { 1285 return false; 1286 } 1287 } else { 1288 ++pos; 1289 } 1290 // Any character not in the RFC 3986 safe set is %-encoded. 1291 if (decoded < URL_SAFE.length && URL_SAFE[decoded]) { 1292 sb.appendCodePoint(decoded); 1293 } else if (decoded < 0x80) { 1294 sb.append('%') 1295 .append(HEX_DIGITS[(decoded >>> 4) & 0xf]) 1296 .append(HEX_DIGITS[(decoded >>> 0) & 0xf]); 1297 } else if (decoded < 0x800) { 1298 int octet0 = 0xc0 | ((decoded >>> 6) & 0x1f), 1299 octet1 = 0x80 | (decoded & 0x3f); 1300 sb.append('%') 1301 .append(HEX_DIGITS[(octet0 >>> 4) & 0xf]) 1302 .append(HEX_DIGITS[(octet0 >>> 0) & 0xf]) 1303 .append('%') 1304 .append(HEX_DIGITS[(octet1 >>> 4) & 0xf]) 1305 .append(HEX_DIGITS[(octet1 >>> 0) & 0xf]); 1306 } else if (decoded < 0x10000) { 1307 int octet0 = 0xe0 | ((decoded >>> 12) & 0xf), 1308 octet1 = 0x80 | ((decoded >>> 6) & 0x3f), 1309 octet2 = 0x80 | (decoded & 0x3f); 1310 sb.append('%') 1311 .append(HEX_DIGITS[(octet0 >>> 4) & 0xf]) 1312 .append(HEX_DIGITS[(octet0 >>> 0) & 0xf]) 1313 .append('%') 1314 .append(HEX_DIGITS[(octet1 >>> 4) & 0xf]) 1315 .append(HEX_DIGITS[(octet1 >>> 0) & 0xf]) 1316 .append('%') 1317 .append(HEX_DIGITS[(octet2 >>> 4) & 0xf]) 1318 .append(HEX_DIGITS[(octet2 >>> 0) & 0xf]); 1319 } else { 1320 int octet0 = 0xf0 | ((decoded >>> 18) & 0x7), 1321 octet1 = 0x80 | ((decoded >>> 12) & 0x3f), 1322 octet2 = 0x80 | ((decoded >>> 6) & 0x3f), 1323 octet3 = 0x80 | (decoded & 0x3f); 1324 sb.append('%') 1325 .append(HEX_DIGITS[(octet0 >>> 4) & 0xf]) 1326 .append(HEX_DIGITS[(octet0 >>> 0) & 0xf]) 1327 .append('%') 1328 .append(HEX_DIGITS[(octet1 >>> 4) & 0xf]) 1329 .append(HEX_DIGITS[(octet1 >>> 0) & 0xf]) 1330 .append('%') 1331 .append(HEX_DIGITS[(octet2 >>> 4) & 0xf]) 1332 .append(HEX_DIGITS[(octet2 >>> 0) & 0xf]) 1333 .append('%') 1334 .append(HEX_DIGITS[(octet3 >>> 4) & 0xf]) 1335 .append(HEX_DIGITS[(octet3 >>> 0) & 0xf]); 1336 } 1337 } 1338 1339 // skip space. 1340 for (; pos < cssLimit; ++pos) { 1341 char ch = css.charAt(pos); 1342 if (ch != ' ' && !isLineTerminator(ch)) { break; } 1343 } 1344 if (pos < cssLimit && css.charAt(pos) == ')') { 1345 ++pos; 1346 } else { 1347 // broken-url 1348 } 1349 sb.append("')"); 1350 return true; 1351 } 1352 1353 /** 1354 * Reads the codepoint at pos, leaving pos at the index of the last code 1355 * unit. 1356 */ 1357 private int readCodepoint() { 1358 String css = this.css; 1359 char ch = css.charAt(pos); 1360 if (Character.isHighSurrogate(ch) && pos + 1 < cssLimit) { 1361 char next = css.charAt(pos + 1); 1362 if (Character.isLowSurrogate(next)) { 1363 ++pos; 1364 return 0x10000 + (((ch - 0xd800) << 10) | (next - 0xdc00)); 1365 } 1366 } 1367 return ch; 1368 } 1369 } 1370 1371 private static final boolean isIdentPart(int cp) { 1372 return cp >= 0x80 1373 ? Character.isDefined(cp) && cp != '\ufeff' 1374 : IDENT_PART_ASCII[cp]; 1375 } 1376 1377 private static final boolean isDecimal(char ch) { 1378 return '0' <= ch && ch <= '9'; 1379 } 1380 1381 private static final boolean[] IDENT_PART_ASCII = new boolean[128]; 1382 static { 1383 for (int i = '0'; i <= '9'; ++i) { IDENT_PART_ASCII[i] = true; } 1384 for (int i = 'A'; i <= 'Z'; ++i) { IDENT_PART_ASCII[i] = true; } 1385 for (int i = 'a'; i <= 'z'; ++i) { IDENT_PART_ASCII[i] = true; } 1386 IDENT_PART_ASCII['_'] = true; 1387 IDENT_PART_ASCII['-'] = true; 1388 } 1389 1390 private static final int LINE_TERMINATOR_BITMASK = 1391 (1 << '\n') | (1 << '\r') | (1 << '\f'); 1392 1393 private static boolean isLineTerminator(char ch) { 1394 return ch < 0x20 && 0 != (LINE_TERMINATOR_BITMASK & (1 << ch)); 1395 } 1396 1397 private static int[] expandIfNecessary(int[] arr, int limit, int needed) { 1398 int neededLength = limit + needed; 1399 int length = arr.length; 1400 if (length >= neededLength) { return arr; } 1401 int[] newArr = new int[Math.max(16, Math.max(neededLength, length * 2))]; 1402 System.arraycopy(arr, 0, newArr, 0, limit); 1403 return newArr; 1404 } 1405 1406 private static int[] truncateOrShare(int[] arr, int limit) { 1407 if (limit == 0) { return ZERO_INTS; } 1408 if (limit == arr.length) { 1409 return arr; 1410 } 1411 int[] trunc = new int[limit]; 1412 System.arraycopy(arr, 0, trunc, 0, limit); 1413 return trunc; 1414 } 1415 1416 private static final int LENGTH_UNIT_TYPE = 0; 1417 private static final int ANGLE_UNIT_TYPE = 1; 1418 private static final int TIME_UNIT_TYPE = 2; 1419 private static final int FREQUENCY_UNIT_TYPE = 3; 1420 private static final int RESOLUTION_UNIT_TYPE = 4; 1421 1422 /** 1423 * See http://dev.w3.org/csswg/css-values/#lengths and 1424 * http://dev.w3.org/csswg/css-values/#other-units 1425 */ 1426 private static final Trie UNIT_TRIE = new Trie( 1427 ImmutableMap.<String, Integer>builder() 1428 .put("em", LENGTH_UNIT_TYPE) 1429 .put("ex", LENGTH_UNIT_TYPE) 1430 .put("ch", LENGTH_UNIT_TYPE) // Width of zero character 1431 .put("rem", LENGTH_UNIT_TYPE) // Root element font-size 1432 .put("vh", LENGTH_UNIT_TYPE) 1433 .put("vw", LENGTH_UNIT_TYPE) 1434 .put("vmin", LENGTH_UNIT_TYPE) 1435 .put("vmax", LENGTH_UNIT_TYPE) 1436 .put("px", LENGTH_UNIT_TYPE) 1437 .put("mm", LENGTH_UNIT_TYPE) 1438 .put("cm", LENGTH_UNIT_TYPE) 1439 .put("in", LENGTH_UNIT_TYPE) 1440 .put("pt", LENGTH_UNIT_TYPE) 1441 .put("pc", LENGTH_UNIT_TYPE) 1442 .put("deg", ANGLE_UNIT_TYPE) 1443 .put("rad", ANGLE_UNIT_TYPE) 1444 .put("grad", ANGLE_UNIT_TYPE) 1445 .put("turn", ANGLE_UNIT_TYPE) 1446 .put("s", TIME_UNIT_TYPE) 1447 .put("ms", TIME_UNIT_TYPE) 1448 .put("hz", FREQUENCY_UNIT_TYPE) 1449 .put("khz", FREQUENCY_UNIT_TYPE) 1450 .put("dpi", RESOLUTION_UNIT_TYPE) 1451 .put("dpcm", RESOLUTION_UNIT_TYPE) 1452 .put("dppx", RESOLUTION_UNIT_TYPE) 1453 .build()); 1454 1455 static boolean isWellKnownUnit(CharSequence s, int start, int end) { 1456 if (start == end) { return false; } 1457 Trie t = UNIT_TRIE; 1458 for (int i = start; i < end; ++i) { 1459 char ch = s.charAt(i); 1460 t = t.lookup('A' <= ch && ch <= 'Z' ? (char) (ch | 32) : ch); 1461 if (t == null) { return false; } 1462 } 1463 return t.isTerminal(); 1464 } 1465 1466 static boolean isWellKnownUnit(CharSequence s) { 1467 return isWellKnownUnit(s, 0, s.length()); 1468 } 1469 1470 private static final boolean[] URL_SAFE = new boolean[128]; 1471 static { 1472 // From RFC 3986 1473 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" 1474 for (int i = 'A'; i <= 'Z'; ++i) { URL_SAFE[i] = true; } 1475 for (int i = 'a'; i <= 'z'; ++i) { URL_SAFE[i] = true; } 1476 for (int i = '0'; i <= '9'; ++i) { URL_SAFE[i] = true; } 1477 URL_SAFE['-'] = true; 1478 URL_SAFE['.'] = true; 1479 URL_SAFE['_'] = true; 1480 URL_SAFE['~'] = true; 1481 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" 1482 URL_SAFE[':'] = true; 1483 URL_SAFE['/'] = true; 1484 URL_SAFE['?'] = true; 1485 URL_SAFE['#'] = true; 1486 URL_SAFE['['] = true; 1487 URL_SAFE[']'] = true; 1488 URL_SAFE['@'] = true; 1489 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" 1490 // / "*" / "+" / "," / ";" / "=" 1491 URL_SAFE['!'] = true; 1492 URL_SAFE['$'] = true; 1493 URL_SAFE['&'] = true; 1494 // Only used in obsolete mark rule and special in unquoted URLs or comment 1495 // delimiters. 1496 // URL_SAFE['\''] = true; 1497 // URL_SAFE['('] = true; 1498 // URL_SAFE[')'] = true; 1499 // URL_SAFE['*'] = true; 1500 URL_SAFE['+'] = true; 1501 URL_SAFE[','] = true; 1502 URL_SAFE[';'] = true; 1503 URL_SAFE['='] = true; 1504 // % is used to encode unsafe octets. 1505 URL_SAFE['%'] = true; 1506 } 1507 1508 private static final char[] HEX_DIGITS = { 1509 '0', '1', '2', '3', 1510 '4', '5', '6', '7', 1511 '8', '9', 'a', 'b', 1512 'c', 'd', 'e', 'f' 1513 }; 1514} 1515