// Copyright (c) 2013, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; import javax.annotation.Nullable; import com.google.common.collect.ImmutableMap; /** * Given a string of CSS, produces a string of normalized CSS with certain * useful properties detailed below. * */ final class CssTokens implements Iterable { public final String normalizedCss; public final Brackets brackets; private final int[] tokenBreaks; private final TokenType[] tokenTypes; public TokenIterator start() { return new TokenIterator(tokenTypes.length); } public TokenIterator iterator() { return start(); } public static CssTokens lex(String css) { Lexer lexer = new Lexer(css); lexer.lex(); return lexer.build(); } /** A cursor into a list of tokens. */ public final class TokenIterator implements Iterator { private int tokenIndex = 0; private final int limit; TokenIterator(int limit) { this.limit = limit; } public boolean hasNext() { return hasToken(); } public String next() { String token = token(); advance(); return token; } public @Nullable TokenIterator spliceToEnd() { if (!hasNext()) { throw new NoSuchElementException(); } int end = brackets.partner(tokenIndex); if (end < 0) { return null; } TokenIterator between = new TokenIterator(end); between.tokenIndex = tokenIndex + 1; tokenIndex = end + 1; return between; } public int tokenIndex() { return tokenIndex; } public int startOffset() { return tokenBreaks[tokenIndex]; } public int endOffset() { return tokenBreaks[tokenIndex+1]; } public String token() { return normalizedCss.substring(startOffset(), endOffset()); } public boolean hasToken() { return tokenIndex < limit; } public boolean hasTokenAfterSpace() { while (hasToken()) { if (type() != TokenType.WHITESPACE) { return true; } advance(); } return false; } /** The type of the current token. */ public TokenType type() { return tokenTypes[tokenIndex]; } public void seek(int tokenIndex) { this.tokenIndex = tokenIndex; } public void advance() { if (!hasToken()) { throw new NoSuchElementException(); } ++tokenIndex; } public void backup() { if (tokenIndex == 0) { throw new NoSuchElementException(); } --tokenIndex; } public void remove() throws UnsupportedOperationException { throw new UnsupportedOperationException(); } } private CssTokens( String normalizedCss, Brackets brackets, int[] tokenBreaks, TokenType[] tokenTypes) { this.normalizedCss = normalizedCss; this.brackets = brackets; this.tokenBreaks = tokenBreaks; this.tokenTypes = tokenTypes; } public enum TokenType { /** An identifier. */ IDENT, /** An identifier prefixed with a period. */ DOT_IDENT, /** A function name and opening bracket. */ FUNCTION, /** An {@code @} directive token. */ AT, /** A hash token that contains non-hex characters. */ HASH_ID, /** A hash token that could be a color literal. */ HASH_UNRESTRICTED, /** A quoted string. */ STRING, /** A URL of the form url("..."). */ URL, /** A single character. */ DELIM, /** A scalar numeric value. */ NUMBER, /** A percentage. */ PERCENTAGE, /** A numeric value with a unit suffix. */ DIMENSION, /** A numeric value with an unknown unit suffix. */ BAD_DIMENSION, /** {@code U+} */ UNICODE_RANGE, /** * include-match, dash-match, prefix-match, suffix-match, substring-match */ MATCH, /** {@code ||} */ COLUMN, /** A run of white-space, comment, CDO, and CDC tokens. */ WHITESPACE, /** {@code :} */ COLON, /** {@code ;} */ SEMICOLON, /** {@code ,} */ COMMA, /** {@code [} */ LEFT_SQUARE, /** {@code ]} */ RIGHT_SQUARE, /** {@code (} */ LEFT_PAREN, /** {@code )} */ RIGHT_PAREN, /** { */ LEFT_CURLY, /** } */ RIGHT_CURLY, ; } /** * Maps tokens to their partners. A close bracket token like {@code (} may * have a partner token like {@code )} if properly nested, and vice-versa. */ static final class Brackets { /** * For each token index, the index of the indexed token's partner or -1 if * it has none. */ private final int[] brackets; private Brackets(int[] brackets) { this.brackets = brackets; } /** The index of the partner token or -1 if none. */ int partner(int tokenIndex) { int bracketIndex = bracketIndexForToken(tokenIndex); if (bracketIndex < 0) { return -1; } return brackets[(bracketIndex << 1) + 1]; } int bracketIndexForToken(int target) { // Binary search by leftmost element of pair. int left = 0; int right = brackets.length >> 1; while (left < right) { int mid = left + ((right - left) >> 1); int value = brackets[mid << 1]; if (value == target) { return mid; } if (value < target) { left = mid + 1; } else { right = mid; } } return -1; } } private static final int[] ZERO_INTS = new int[0]; private static final TokenType[] ZERO_TYPES = new TokenType[0]; private static final Brackets EMPTY_BRACKETS = new Brackets(ZERO_INTS); private static final CssTokens EMPTY = new CssTokens( "", EMPTY_BRACKETS, ZERO_INTS, ZERO_TYPES); /** * Tokenizes according to section 4 of http://dev.w3.org/csswg/css-syntax/ */ private static final class Lexer { private final String css; private final StringBuilder sb; private int pos = 0; private final int cssLimit; private List tokenTypes = null; private int[] tokenBreaks = new int[128]; private int tokenBreaksLimit = 0; /** * For each bracket, 2 ints: the token index of the bracket, and the token * index of its partner. * The array is sorted by the first int. * The second int is -1 when the bracket has not yet been closed. */ private int[] brackets = ZERO_INTS; /** * The number of elements in {@link #brackets} that are valid. * {@code brackets[bracketsLimit:]} is zeroed space that the list can grow * into. */ private int bracketsLimit = 0; /** * For each bracket that has not been closed, 2 ints: * its index in {@link #brackets} and the character of its close bracket * as an int. * This is a bracket stack so the array is sorted by the first int. */ private int[] open = ZERO_INTS; /** * The number of elements in {@link #open} that are valid. * {@code open[openLimit:]} is garbage space that the stack can grow into. */ private int openLimit = 0; Lexer(String css) { this.css = css; this.sb = new StringBuilder(); this.cssLimit = css.length(); } TokenType openBracket(char bracketChar) { char close; TokenType type; switch (bracketChar) { case '(': close = ')'; type = TokenType.LEFT_PAREN; break; case '[': close = ']'; type = TokenType.LEFT_SQUARE; break; case '{': close = '}'; type = TokenType.LEFT_CURLY; break; default: throw new AssertionError("Invalid open bracket " + bracketChar); } brackets = expandIfNecessary(brackets, bracketsLimit, 2); open = expandIfNecessary(open, openLimit, 2); open[openLimit++] = bracketsLimit; open[openLimit++] = close; brackets[bracketsLimit++] = tokenBreaksLimit; brackets[bracketsLimit++] = -1; sb.append(bracketChar); return type; } void closeBracket(char bracketChar) { int openLimitAfterClose = openLimit; do { if (openLimitAfterClose == 0) { // Drop an orphaned close bracket. breakOutput(); return; } openLimitAfterClose -= 2; } while (bracketChar != open[openLimitAfterClose + 1]); closeBrackets(openLimitAfterClose); } private void closeBrackets(int openLimitAfterClose) { // Make sure we've got space on brackets. int spaceNeeded = openLimit - openLimitAfterClose; brackets = expandIfNecessary(brackets, bracketsLimit, spaceNeeded); int closeTokenIndex = tokenBreaksLimit; while (openLimit > openLimitAfterClose) { // Pop the stack. int closeBracket = open[--openLimit]; int openBracketIndex = open[--openLimit]; int openTokenIndex = brackets[openBracketIndex]; // Update open bracket to point to its partner. brackets[openBracketIndex + 1] = closeTokenIndex; // Emit the close bracket. brackets[bracketsLimit++] = closeTokenIndex; brackets[bracketsLimit++] = openTokenIndex; sb.appendCodePoint(closeBracket); closeTokenIndex++; } } CssTokens build() { // Close any still open brackets. { int startOfCloseBrackets = sb.length(); closeBrackets(0); emitMergedTokens(startOfCloseBrackets, sb.length()); } if (tokenTypes == null) { return EMPTY; } int[] bracketsTrunc = truncateOrShare(brackets, bracketsLimit); // Strip any trailing space off, since it may have been inserted by a // breakAfter call anyway. int cssEnd = sb.length(); if (cssEnd > 0 && sb.charAt(cssEnd - 1) == ' ') { --cssEnd; tokenTypes.remove(--tokenBreaksLimit); } String normalizedCss = sb.substring(0, cssEnd); // Store the last character on the tokenBreaksList to simplify finding the // end of a token. tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1); tokenBreaks[tokenBreaksLimit++] = normalizedCss.length(); int[] tokenBreaksTrunc = truncateOrShare(tokenBreaks, tokenBreaksLimit); TokenType[] tokenTypesArr = tokenTypes.toArray(ZERO_TYPES); return new CssTokens( normalizedCss, new Brackets(bracketsTrunc), tokenBreaksTrunc, tokenTypesArr); } void lex() { // Fast-track no content. consumeIgnorable(); sb.setLength(0); if (pos == cssLimit) { return; } tokenTypes = new ArrayList(); String css = this.css; int cssLimit = this.cssLimit; while (pos < cssLimit) { assert this.tokenBreaksLimit == this.tokenTypes.size() : "token and types out of sync at " + tokenBreaksLimit + " in `" + css + "`"; // SPEC: 4. Tokenization // The output of the tokenization step is a stream of zero // or more of the following tokens: , , // , , , , , // , , , , // , , , // , , , // , , , , // , , , , <[>, <]>, // <(>, <)>, <{>, and <}>. // IMPLEMENTS: 4.3 Consume a token char ch = css.charAt(pos); int startOfToken = pos; int startOfOutputToken = sb.length(); final TokenType type; switch (ch) { case '\t': case '\n': case '\f': case '\r': case ' ': case '\ufeff': consumeIgnorable(); type = TokenType.WHITESPACE; break; case '/': { char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0; if (lookahead == '/' || lookahead == '*') { consumeIgnorable(); type = TokenType.WHITESPACE; } else { consumeDelim(ch); type = TokenType.DELIM; } break; } case '<': if (consumeIgnorable()) { // type = TokenType.WHITESPACE; } else { type = consumeIdentOrUrlOrFunction(); } } else if (isIdentPart(lookahead)) { // treat "." as one token. sb.append('.'); ++pos; consumeIdent(false); if (pos != startOfToken + 1) { type = TokenType.DOT_IDENT; if (pos < cssLimit) { char next = css.charAt(pos); if ('(' == next) { // A dotted identifier followed by a parenthesis is // ambiguously a function. sb.append(' '); } } } else { type = TokenType.DELIM; sb.append(' '); } } else { consumeDelim('.'); type = TokenType.DELIM; } break; } case ':': consumeDelim(ch); type = TokenType.COLON; break; case ';': consumeDelim(ch); type = TokenType.SEMICOLON; break; case ',': consumeDelim(ch); type = TokenType.COMMA; break; case '[': case '(': case '{': type = openBracket(ch); ++pos; break; case '}': case ')': case ']': closeBracket(ch); ++pos; // Use DELIM so that a later loop will split output into multiple // tokens since we may have inserted missing close brackets for // unclosed open brackets already on the stack. type = TokenType.DELIM; break; case '~': case '|': case '^': case '$': case '*': { char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0; if (lookahead == '=') { consumeMatch(ch); type = TokenType.MATCH; } else if (ch == '|' && lookahead == '|') { consumeColumn(); type = TokenType.COLUMN; } else { consumeDelim(ch); type = TokenType.DELIM; } break; } case '_': type = consumeIdentOrUrlOrFunction(); break; case '\\': { // Optimistically parse as an ident. TokenType identType = consumeIdentOrUrlOrFunction(); if (identType == null) { ++pos; // drop breakOutput(); type = TokenType.WHITESPACE; } else { type = identType; } // TODO: handle case where "url" is encoded. break; } default: int chlower = ch | 32; if ('a' <= chlower && chlower <= 'z' || ch >= 0x80) { TokenType identType = consumeIdentOrUrlOrFunction(); if (identType != null) { type = identType; } else { // Occurs on undefined-codepoints. ++pos; breakOutput(); type = TokenType.WHITESPACE; } } else if (ch > 0x20) { consumeDelim(ch); type = TokenType.DELIM; } else { // Ignore. consumeIgnorable(); type = TokenType.WHITESPACE; } } assert pos > startOfToken : "empty token at " + pos + ", ch0=" + css.charAt(startOfToken) + ":U+" + Integer.toHexString(css.charAt(startOfToken)); int endOfOutputToken = sb.length(); if (endOfOutputToken > startOfOutputToken) { if (type == TokenType.DELIM) { emitMergedTokens(startOfOutputToken, endOfOutputToken); } else { if (type != TokenType.WHITESPACE && sb.charAt(startOfOutputToken) == ' ') { emitToken(TokenType.WHITESPACE, startOfOutputToken); ++startOfOutputToken; assert startOfOutputToken != endOfOutputToken; } emitToken(type, startOfOutputToken); // Token emitters can emit a space after a token to avoid possible // merges with following tokens if (type != TokenType.WHITESPACE) { int sbLen = sb.length(); if (startOfOutputToken + 1 < sbLen && sb.charAt(sbLen - 1) == ' ') { emitToken(TokenType.WHITESPACE, sbLen - 1); } } } } } } private void emitMergedTokens(int start, int end) { // Handle breakOutput and merging of output tokens. for (int e = start; e < end; ++e) { TokenType delimType; switch (sb.charAt(e)) { case ' ': delimType = TokenType.WHITESPACE; break; case '}': delimType = TokenType.RIGHT_CURLY; break; case ')': delimType = TokenType.RIGHT_PAREN; break; case ']': delimType = TokenType.RIGHT_SQUARE; break; default : delimType = TokenType.DELIM; break; } emitToken(delimType, e); } } private void emitToken(TokenType type, int startOfOutputToken) { if (tokenBreaksLimit == 0 || tokenBreaks[tokenBreaksLimit - 1] != startOfOutputToken) { tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1); tokenBreaks[tokenBreaksLimit++] = startOfOutputToken; tokenTypes.add(type); } } private void consumeDelim(char ch) { sb.append(ch); switch (ch) { // Prevent token merging. case '~': case '|': case '^': case '$': case '\\': case '.': case '+': case '-': case '@': case '/': case '<': sb.append(' '); break; default: break; } ++pos; } private boolean consumeIgnorable() { String css = this.css; int cssLimit = this.cssLimit; int posBefore = pos; while (pos < cssLimit) { char ch = css.charAt(pos); if (ch <= 0x20 // Treat a BOM as white-space so that it is ignored at the beginning // of a file. || ch == '\ufeff') { ++pos; } else if (pos + 1 == cssLimit) { break; } else if (ch == '/') { char next = css.charAt(pos + 1); if (next == '*') { pos += 2; while (pos < cssLimit) { int ast = css.indexOf('*', pos); if (ast < 0) { pos = cssLimit; // Unclosed /* comment */ break; } else { // Advance over a run of '*'s. pos = ast + 1; while (pos < cssLimit && css.charAt(pos) == '*') { ++pos; } if (pos < cssLimit && css.charAt(pos) == '/') { ++pos; break; } } } } else if (next == '/') { // Non-standard but widely supported while (++pos < cssLimit) { if (isLineTerminator(css.charAt(pos))) { break; } } } else { break; } } else if (ch == '<') { if (pos + 3 < cssLimit && '!' == css.charAt(pos + 1) && '-' == css.charAt(pos + 2) && '-' == css.charAt(pos + 3)) { pos += 4; } else { break; } } else if (ch == '-') { if (pos + 2 < cssLimit && '-' == css.charAt(pos + 1) && '>' == css.charAt(pos + 2)) { pos += 3; } else { break; } } else { break; } } if (pos == posBefore) { return false; } else { breakOutput(); return true; } } private void breakOutput() { int last = sb.length() - 1; if (last >= 0 && sb.charAt(last) != ' ') { sb.append(' '); } } private void consumeColumn() { pos += 2; sb.append("||"); } private void consumeMatch(char ch) { pos += 2; sb.append(ch).append('='); } private void consumeIdent(boolean allowFirstDigit) { int cssLimit = this.cssLimit; int last = -1, nCodepoints = 0; int sbAtStart = sb.length(); int posAtStart = pos; while (pos < cssLimit) { int posBefore = pos; int decoded = readCodepoint(); if (decoded == '\\') { decoded = consumeAndDecodeEscapeSequence(); } else { ++pos; } if (decoded >= 0 && isIdentPart(decoded)) { if (!allowFirstDigit && nCodepoints < 2 && '0' <= decoded && decoded <= '9') { // Don't allow encoded identifiers that look like numeric tokens // like \-1 or ones that start with an encoded decimal digit. if (last == '-' || last == -1) { pos = posAtStart; sb.setLength(sbAtStart); return; } } sb.appendCodePoint(decoded); last = decoded; ++nCodepoints; } else { pos = posBefore; return; } } } private boolean consumeAtKeyword() { assert css.charAt(pos) == '@'; int bufferLengthBeforeWrite = sb.length(); sb.append('@'); int posBeforeKeyword = ++pos; consumeIdent(false); if (pos == posBeforeKeyword) { --pos; // back up over '@' sb.setLength(bufferLengthBeforeWrite); // Unwrite the '@' return false; } else { return true; } } private int consumeAndDecodeEscapeSequence() { String css = this.css; int cssLimit = this.cssLimit; assert css.charAt(pos) == '\\'; if (pos + 1 >= cssLimit) { return -1; } char esc = css.charAt(pos + 1); if (isLineTerminator(esc)) { return -1; } int escLower = esc | 32; if (('0' <= esc && esc <= '9') || ('a' <= escLower && escLower <= 'f')) { int hexValue = 0; int hexStart = pos + 1; int hexLimit = Math.min(pos + 7, cssLimit); int hexEnd = hexStart; do { hexValue = (hexValue << 4) | (esc <= '9' ? esc - '0' : escLower - ('a' - 10)); ++hexEnd; if (hexEnd == hexLimit) { break; } esc = css.charAt(hexEnd); escLower = esc | 32; } while (('0' <= esc && esc <= '9') || ('a' <= escLower && escLower <= 'f')); if (!Character.isDefined(hexValue)) { hexValue = 0xfffd; } pos = hexEnd; if (pos < cssLimit) { // A sequence of hex digits can be followed by a space that allows // so that code-point U+A followed by the letter 'b' can be rendered // as "\a b" since "\ab" specifies the single code-point U+AB. char next = css.charAt(pos); if (next == ' ' || next == '\t' || isLineTerminator(next)) { ++pos; } } return hexValue; } pos += 2; return esc; } private static final long HEX_ENCODED_BITMASK = (1L << 0) | LINE_TERMINATOR_BITMASK | (1L << '"') | (1L << '\'') | (1L << '&') | (1L << '<') | (1L << '>'); private static boolean isHexEncoded(int codepoint) { return (0 <= codepoint && codepoint < 63 && 0 != ((1L << codepoint) & HEX_ENCODED_BITMASK)); } private void encodeCharOntoOutput(int codepoint, int last) { switch (codepoint) { case '\\': sb.append("\\\\"); break; case '\0': sb.append("\\0"); break; case '\n': sb.append("\\a"); break; case '\f': sb.append("\\c"); break; case '\r': sb.append("\\d"); break; case '\"': sb.append("\\22"); break; case '&': sb.append("\\26"); break; case '\'': sb.append("\\27"); break; case '<': sb.append("\\3c"); break; case '>': sb.append("\\3e"); break; // The set of escapes above that end with a hex digit must appear in // HEX_ENCODED_BITMASK. case '-': sb.append('-'); break; default: if (isHexEncoded(last) // We need to put a space after a trailing hex digit if the // next encoded character on the output would be another hex // digit or a space character. The other space characters // are handled above. && (codepoint == ' ' || codepoint == '\t' || ('0' <= codepoint && codepoint <= '9') || ('a' <= (codepoint | 32) && (codepoint | 32) <= 'f'))) { sb.append(' '); } sb.appendCodePoint(codepoint); break; } } private TokenType consumeNumberOrPercentageOrDimension() { String css = this.css; int cssLimit = this.cssLimit; boolean isZero = true; int intStart = pos; if (intStart < cssLimit) { char ch = css.charAt(intStart); if (ch == '-' || ch == '+') { ++intStart; } } // Find the integer part after any sign. int intEnd = intStart; for (; intEnd < cssLimit; ++intEnd) { char ch = css.charAt(intEnd); if (!('0' <= ch && ch <= '9')) { break; } if (ch != '0') { isZero = false; } } // Find a fraction like ".5" or ".". int fractionStart = intEnd; int fractionEnd = fractionStart; if (fractionEnd < cssLimit && '.' == css.charAt(fractionEnd)) { ++fractionEnd; for (; fractionEnd < cssLimit; ++fractionEnd) { char ch = css.charAt(fractionEnd); if (!('0' <= ch && ch <= '9')) { break; } if (ch != '0') { isZero = false; } } } int exponentStart = fractionEnd; int exponentIntStart = exponentStart; int exponentEnd = exponentStart; boolean isExponentZero = true; if (exponentStart < cssLimit && 'e' == (css.charAt(exponentStart) | 32)) { // 'e' and 'e' in "5e-f" for a exponentEnd = exponentStart + 1; if (exponentEnd < cssLimit) { char ch = css.charAt(exponentEnd); if (ch == '+' || ch == '-') { ++exponentEnd; } } exponentIntStart = exponentEnd; for (; exponentEnd < cssLimit; ++exponentEnd) { char ch = css.charAt(exponentEnd); if (!('0' <= ch && ch <= '9')) { break; } if (ch != '0') { isExponentZero = false; } } // Since // dimension := // the below are technically valid dimensions even though they appear // to have incomplete exponents: // 5e // 5ex // 5e- if (exponentEnd == exponentIntStart) { // Incomplete exponent. exponentIntStart = exponentEnd = exponentStart; isExponentZero = true; } } int unitStart = exponentEnd; // Skip over space between number and unit. // Many user-agents allow "5 ex" instead of "5ex". while (unitStart < cssLimit) { char ch = css.charAt(unitStart); if (ch == ' ' || isLineTerminator(ch)) { ++unitStart; } else { break; } } if (sb.length() != 0 && isIdentPart(sb.charAt(sb.length() - 1))) { sb.append(' '); } // Normalize the number onto the buffer. // We will normalize and unit later. // Skip the sign if it is positive. if (intStart != pos && '-' == css.charAt(pos) && !isZero) { sb.append('-'); } if (isZero) { sb.append('0'); } else { // Strip leading zeroes from the integer and exponent and trailing // zeroes from the fraction. while (intStart < intEnd && css.charAt(intStart) == '0') { ++intStart; } while (fractionEnd > fractionStart && css.charAt(fractionEnd - 1) == '0') { --fractionEnd; } if (intStart == intEnd) { sb.append('0'); // .5 -> 0.5 } else { sb.append(css, intStart, intEnd); } if (fractionEnd > fractionStart + 1) { // 5. -> 5; 5.0 -> 5 sb.append(css, fractionStart, fractionEnd); } if (!isExponentZero) { sb.append('e'); // 1e+1 -> 1e1 if ('-' == css.charAt(exponentIntStart - 1)) { sb.append('-'); } while (exponentIntStart < exponentEnd && css.charAt(exponentIntStart) == '0') { ++exponentIntStart; } sb.append(css, exponentIntStart, exponentEnd); } } int unitEnd; TokenType type; if (unitStart < cssLimit && '%' == css.charAt(unitStart)) { unitEnd = unitStart + 1; type = TokenType.PERCENTAGE; sb.append('%'); } else { // The grammar says that any identifier following a number is a unit. int bufferBeforeUnit = sb.length(); pos = unitStart; consumeIdent(false); int bufferAfterUnit = sb.length(); boolean knownUnit = isWellKnownUnit( sb, bufferBeforeUnit, bufferAfterUnit); if (unitStart == exponentEnd // No intervening space || knownUnit) { unitEnd = pos; // 3IN -> 3in for (int i = bufferBeforeUnit; i < bufferAfterUnit; ++i) { char ch = sb.charAt(i); if ('A' <= ch && ch <= 'Z') { sb.setCharAt(i, (char) (ch | 32)); } } } else { unitEnd = unitStart = exponentEnd; sb.setLength(bufferBeforeUnit); } type = unitStart == unitEnd ? TokenType.NUMBER : knownUnit ? TokenType.DIMENSION : TokenType.BAD_DIMENSION; } pos = unitEnd; if (type != TokenType.PERCENTAGE && pos < cssLimit && css.charAt(pos) == '.') { sb.append(' '); } return type; } private TokenType consumeString() { String css = this.css; int cssLimit = this.cssLimit; char delim = css.charAt(pos); assert delim == '"' || delim == '\''; ++pos; int startOfStringOnOutput = sb.length(); sb.append('\''); int last = -1; boolean closed = false; while (pos < cssLimit) { char ch = css.charAt(pos); if (ch == delim) { ++pos; closed = true; break; } if (isLineTerminator(ch)) { break; } int decoded = ch; if (ch == '\\') { if (pos + 1 < cssLimit && isLineTerminator(css.charAt(pos+1))) { // consume it but generate no tokens. // Lookahead to treat a \r\n sequence as one line-terminator. if (pos + 2 < cssLimit && css.charAt(pos+1) == '\r' && css.charAt(pos+2) == '\n') { pos += 3; } else { pos += 2; } continue; } else { decoded = consumeAndDecodeEscapeSequence(); if (decoded < 0) { break; } } } else { ++pos; } encodeCharOntoOutput(decoded, last); last = decoded; } if (closed) { sb.append('\''); return TokenType.STRING; } else { // Drop s sb.setLength(startOfStringOnOutput); breakOutput(); return TokenType.WHITESPACE; } } private @Nullable TokenType consumeHash() { assert css.charAt(pos) == '#'; ++pos; int beforeIdent = pos; consumeIdent(true); if (pos == beforeIdent) { pos = beforeIdent - 1; return null; } for (int i = beforeIdent; i < pos; ++i) { char chLower = (char) (css.charAt(i) | 32); if (!(('0' <= chLower && chLower <= '9') || ('a' <= chLower && chLower <= 'f'))) { return TokenType.HASH_ID; } } return TokenType.HASH_UNRESTRICTED; } private boolean consumeUnicodeRange() { final String css = this.css; final int cssLimit = this.cssLimit; assert pos < cssLimit && (css.charAt(pos) | 32) == 'u'; final int start = pos; final int startOfOutput = sb.length(); ++pos; boolean ok = false; parse: try { if (pos == cssLimit || css.charAt(pos) != '+') { break parse; } ++pos; sb.append("U+"); int numStartDigits = 0; while (pos < cssLimit && numStartDigits < 6) { char chLower = (char) (css.charAt(pos) | 32); if (('0' <= chLower && chLower <= '9') || ('a' <= chLower && chLower <= 'f')) { sb.append(chLower); ++numStartDigits; ++pos; } else { break; } } if (numStartDigits == 0) { break parse; } boolean hasQmark = false; while (pos < cssLimit && numStartDigits < 6 && css.charAt(pos) == '?') { hasQmark = true; sb.append('?'); ++numStartDigits; ++pos; } if (numStartDigits == 0) { break parse; } if (pos < cssLimit && css.charAt(pos) == '-') { if (!hasQmark) { // Look for end of range. ++pos; sb.append('-'); int numEndDigits = 0; while (pos < cssLimit && numEndDigits < 6) { char chLower = (char) (css.charAt(pos) | 32); if (('0' <= chLower && chLower <= '9') || ('a' <= chLower && chLower <= 'f')) { ++numEndDigits; ++pos; sb.append(chLower); } else { break; } } if (numEndDigits == 0) { // Back up over '-' --pos; sb.append(' '); } } else { sb.append(' '); } } ok = true; } finally { if (!ok) { pos = start; sb.setLength(startOfOutput); } } return ok; } private @Nullable TokenType consumeIdentOrUrlOrFunction() { int bufferStart = sb.length(); int posBefore = pos; consumeIdent(false); if (pos == posBefore) { return null; } boolean parenAfter = pos < cssLimit && css.charAt(pos) == '('; if (sb.length() - bufferStart == 3 && 'u' == (sb.charAt(bufferStart) | 32) && 'r' == (sb.charAt(bufferStart + 1) | 32) && 'l' == (sb.charAt(bufferStart + 2) | 32)) { if (parenAfter && consumeUrlValue()) { sb.setCharAt(bufferStart, 'u'); sb.setCharAt(bufferStart + 1, 'r'); sb.setCharAt(bufferStart + 2, 'l'); return TokenType.URL; } else { sb.setLength(bufferStart); breakOutput(); return TokenType.WHITESPACE; } } else if (parenAfter) { openBracket('('); ++pos; return TokenType.FUNCTION; } else { if (pos + 1 < cssLimit && '.' == css.charAt(pos)) { // Prevent merging of ident and number as in // border:solid.1cm black // when .1 is rewritten to 0.1 becoming // border:solid0.1cm black char next = css.charAt(pos + 1); if ('0' <= next && next <= '9') { sb.append(' '); } } return TokenType.IDENT; } } private boolean consumeUrlValue() { String css = this.css; int cssLimit = this.cssLimit; if (pos == cssLimit || css.charAt(pos) != '(') { return false; } ++pos; // skip space. for (; pos < cssLimit; ++pos) { char ch = css.charAt(pos); if (ch != ' ' && !isLineTerminator(ch)) { break; } } // Find the value. int delim; if (pos < cssLimit) { char ch = pos < cssLimit ? css.charAt(pos) : '\0'; if (ch == '"' || ch == '\'') { delim = ch; ++pos; } else { delim = '\0'; } } else { return false; } sb.append("('"); while (pos < cssLimit) { int decoded = readCodepoint(); if (delim != 0) { if (decoded == delim) { ++pos; break; } } else if (decoded <= ' ' || decoded == ')') { break; } if (decoded == '\\') { decoded = consumeAndDecodeEscapeSequence(); if (decoded < 0) { return false; } } else { ++pos; } // Any character not in the RFC 3986 safe set is %-encoded. if (decoded < URL_SAFE.length && URL_SAFE[decoded]) { sb.appendCodePoint(decoded); } else if (decoded < 0x80) { sb.append('%') .append(HEX_DIGITS[(decoded >>> 4) & 0xf]) .append(HEX_DIGITS[(decoded >>> 0) & 0xf]); } else if (decoded < 0x800) { int octet0 = 0xc0 | ((decoded >>> 6) & 0x1f), octet1 = 0x80 | (decoded & 0x3f); sb.append('%') .append(HEX_DIGITS[(octet0 >>> 4) & 0xf]) .append(HEX_DIGITS[(octet0 >>> 0) & 0xf]) .append('%') .append(HEX_DIGITS[(octet1 >>> 4) & 0xf]) .append(HEX_DIGITS[(octet1 >>> 0) & 0xf]); } else if (decoded < 0x10000) { int octet0 = 0xe0 | ((decoded >>> 12) & 0xf), octet1 = 0x80 | ((decoded >>> 6) & 0x3f), octet2 = 0x80 | (decoded & 0x3f); sb.append('%') .append(HEX_DIGITS[(octet0 >>> 4) & 0xf]) .append(HEX_DIGITS[(octet0 >>> 0) & 0xf]) .append('%') .append(HEX_DIGITS[(octet1 >>> 4) & 0xf]) .append(HEX_DIGITS[(octet1 >>> 0) & 0xf]) .append('%') .append(HEX_DIGITS[(octet2 >>> 4) & 0xf]) .append(HEX_DIGITS[(octet2 >>> 0) & 0xf]); } else { int octet0 = 0xf0 | ((decoded >>> 18) & 0x7), octet1 = 0x80 | ((decoded >>> 12) & 0x3f), octet2 = 0x80 | ((decoded >>> 6) & 0x3f), octet3 = 0x80 | (decoded & 0x3f); sb.append('%') .append(HEX_DIGITS[(octet0 >>> 4) & 0xf]) .append(HEX_DIGITS[(octet0 >>> 0) & 0xf]) .append('%') .append(HEX_DIGITS[(octet1 >>> 4) & 0xf]) .append(HEX_DIGITS[(octet1 >>> 0) & 0xf]) .append('%') .append(HEX_DIGITS[(octet2 >>> 4) & 0xf]) .append(HEX_DIGITS[(octet2 >>> 0) & 0xf]) .append('%') .append(HEX_DIGITS[(octet3 >>> 4) & 0xf]) .append(HEX_DIGITS[(octet3 >>> 0) & 0xf]); } } // skip space. for (; pos < cssLimit; ++pos) { char ch = css.charAt(pos); if (ch != ' ' && !isLineTerminator(ch)) { break; } } if (pos < cssLimit && css.charAt(pos) == ')') { ++pos; } else { // broken-url } sb.append("')"); return true; } /** * Reads the codepoint at pos, leaving pos at the index of the last code * unit. */ private int readCodepoint() { String css = this.css; char ch = css.charAt(pos); if (Character.isHighSurrogate(ch) && pos + 1 < cssLimit) { char next = css.charAt(pos + 1); if (Character.isLowSurrogate(next)) { ++pos; return 0x10000 + (((ch - 0xd800) << 10) | (next - 0xdc00)); } } return ch; } } private static final boolean isIdentPart(int cp) { return cp >= 0x80 ? Character.isDefined(cp) && cp != '\ufeff' : IDENT_PART_ASCII[cp]; } private static final boolean isDecimal(char ch) { return '0' <= ch && ch <= '9'; } private static final boolean[] IDENT_PART_ASCII = new boolean[128]; static { for (int i = '0'; i <= '9'; ++i) { IDENT_PART_ASCII[i] = true; } for (int i = 'A'; i <= 'Z'; ++i) { IDENT_PART_ASCII[i] = true; } for (int i = 'a'; i <= 'z'; ++i) { IDENT_PART_ASCII[i] = true; } IDENT_PART_ASCII['_'] = true; IDENT_PART_ASCII['-'] = true; } private static final int LINE_TERMINATOR_BITMASK = (1 << '\n') | (1 << '\r') | (1 << '\f'); private static boolean isLineTerminator(char ch) { return ch < 0x20 && 0 != (LINE_TERMINATOR_BITMASK & (1 << ch)); } private static int[] expandIfNecessary(int[] arr, int limit, int needed) { int neededLength = limit + needed; int length = arr.length; if (length >= neededLength) { return arr; } int[] newArr = new int[Math.max(16, Math.max(neededLength, length * 2))]; System.arraycopy(arr, 0, newArr, 0, limit); return newArr; } private static int[] truncateOrShare(int[] arr, int limit) { if (limit == 0) { return ZERO_INTS; } if (limit == arr.length) { return arr; } int[] trunc = new int[limit]; System.arraycopy(arr, 0, trunc, 0, limit); return trunc; } private static final int LENGTH_UNIT_TYPE = 0; private static final int ANGLE_UNIT_TYPE = 1; private static final int TIME_UNIT_TYPE = 2; private static final int FREQUENCY_UNIT_TYPE = 3; private static final int RESOLUTION_UNIT_TYPE = 4; /** * See http://dev.w3.org/csswg/css-values/#lengths and * http://dev.w3.org/csswg/css-values/#other-units */ private static final Trie UNIT_TRIE = new Trie( ImmutableMap.builder() .put("em", LENGTH_UNIT_TYPE) .put("ex", LENGTH_UNIT_TYPE) .put("ch", LENGTH_UNIT_TYPE) // Width of zero character .put("rem", LENGTH_UNIT_TYPE) // Root element font-size .put("vh", LENGTH_UNIT_TYPE) .put("vw", LENGTH_UNIT_TYPE) .put("vmin", LENGTH_UNIT_TYPE) .put("vmax", LENGTH_UNIT_TYPE) .put("px", LENGTH_UNIT_TYPE) .put("mm", LENGTH_UNIT_TYPE) .put("cm", LENGTH_UNIT_TYPE) .put("in", LENGTH_UNIT_TYPE) .put("pt", LENGTH_UNIT_TYPE) .put("pc", LENGTH_UNIT_TYPE) .put("deg", ANGLE_UNIT_TYPE) .put("rad", ANGLE_UNIT_TYPE) .put("grad", ANGLE_UNIT_TYPE) .put("turn", ANGLE_UNIT_TYPE) .put("s", TIME_UNIT_TYPE) .put("ms", TIME_UNIT_TYPE) .put("hz", FREQUENCY_UNIT_TYPE) .put("khz", FREQUENCY_UNIT_TYPE) .put("dpi", RESOLUTION_UNIT_TYPE) .put("dpcm", RESOLUTION_UNIT_TYPE) .put("dppx", RESOLUTION_UNIT_TYPE) .build()); static boolean isWellKnownUnit(CharSequence s, int start, int end) { if (start == end) { return false; } Trie t = UNIT_TRIE; for (int i = start; i < end; ++i) { char ch = s.charAt(i); t = t.lookup('A' <= ch && ch <= 'Z' ? (char) (ch | 32) : ch); if (t == null) { return false; } } return t.isTerminal(); } static boolean isWellKnownUnit(CharSequence s) { return isWellKnownUnit(s, 0, s.length()); } private static final boolean[] URL_SAFE = new boolean[128]; static { // From RFC 3986 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" for (int i = 'A'; i <= 'Z'; ++i) { URL_SAFE[i] = true; } for (int i = 'a'; i <= 'z'; ++i) { URL_SAFE[i] = true; } for (int i = '0'; i <= '9'; ++i) { URL_SAFE[i] = true; } URL_SAFE['-'] = true; URL_SAFE['.'] = true; URL_SAFE['_'] = true; URL_SAFE['~'] = true; // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" URL_SAFE[':'] = true; URL_SAFE['/'] = true; URL_SAFE['?'] = true; URL_SAFE['#'] = true; URL_SAFE['['] = true; URL_SAFE[']'] = true; URL_SAFE['@'] = true; // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" // / "*" / "+" / "," / ";" / "=" URL_SAFE['!'] = true; URL_SAFE['$'] = true; URL_SAFE['&'] = true; // Only used in obsolete mark rule and special in unquoted URLs or comment // delimiters. // URL_SAFE['\''] = true; // URL_SAFE['('] = true; // URL_SAFE[')'] = true; // URL_SAFE['*'] = true; URL_SAFE['+'] = true; URL_SAFE[','] = true; URL_SAFE[';'] = true; URL_SAFE['='] = true; // % is used to encode unsafe octets. URL_SAFE['%'] = true; } private static final char[] HEX_DIGITS = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; }