// owasp/html/CssGrammar.java

// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.html;

final class CssGrammar {

  /**
   * Skips over tokens that could not be parsed as part of a property so that
   * parsing can resume at a plausible boundary.
   *
   * <p>Scanning stops at the first of:
   * <ul>
   *   <li>a semicolon (at any bracket depth), which is consumed;
   *   <li>a close bracket with no matching open bracket seen during this
   *       scan, which is consumed;
   *   <li>a close bracket that balances the open brackets seen during this
   *       scan, which is left as the current token;
   *   <li>the end of the token stream.
   * </ul>
   *
   * @param it the iterator to move forward; positioned on the first token to
   *     skip.
   */
  private static void errorRecoveryUntilSemiOrCloseBracket(
      CssTokens.TokenIterator it) {
    int openBrackets = 0;
    while (it.hasNext()) {
      CssTokens.TokenType tokenType = it.type();
      switch (tokenType) {
        case SEMICOLON:
          // The semicolon ends the malformed run; step past it.
          it.advance();
          return;
        case LEFT_CURLY:
        case LEFT_PAREN:
        case LEFT_SQUARE:
          openBrackets++;
          break;
        case RIGHT_CURLY:
        case RIGHT_PAREN:
        case RIGHT_SQUARE:
          openBrackets--;
          if (openBrackets < 0) {
            // Unmatched close bracket: consume it before stopping.
            it.advance();
            return;
          }
          if (openBrackets == 0) {
            // Balanced close bracket: stop without consuming it.
            return;
          }
          break;
        default:
          break;
      }
      it.advance();
    }
  }

  /**
   * Lexes {@code css} and reports each {@code name: value} declaration found
   * in it to {@code handler}.
   *
   * <p>For every declaration the handler receives
   * {@link PropertyHandler#startProperty} with the lower-cased property name,
   * then the callbacks produced while parsing the value, then
   * {@link PropertyHandler#endProperty}.  Anything that does not start with an
   * identifier followed by a colon is skipped via error recovery.
   *
   * @param css the text of a CSS property group.
   * @param handler receives the parse events.
   */
  static void parsePropertyGroup(String css, PropertyHandler handler) {
    // Tokens are split by semicolons/curly-braces, then by the first colon,
    // dropping spaces and comments to identify property names and the token
    // runs that form each value.
    CssTokens.TokenIterator it = CssTokens.lex(css).iterator();

    while (it.hasTokenAfterSpace()) {
      // Only an identifier can be a property name.
      if (it.type() == CssTokens.TokenType.IDENT) {
        String propertyName = it.next();

        // The name must be followed by a colon.
        if (it.hasTokenAfterSpace() && ":".equals(it.token())) {
          it.advance();

          handler.startProperty(Strings.toLowerCase(propertyName));
          parsePropertyValue(it, handler);
          handler.endProperty();
          continue;
        }
      }

      // Not a well-formed "name:" prefix; resynchronize and try again.
      errorRecoveryUntilSemiOrCloseBracket(it);
    }
  }

  /**
   * Reports the tokens of a property value to {@code handler} until a
   * semicolon has been consumed or {@code it} has no more tokens.
   *
   * <p>A function token is reported as a {@code startFunction} /
   * {@code endFunction} pair around a recursive parse of the iterator
   * returned by {@code it.spliceToEnd()}.
   *
   * @param it positioned on the first token of the value.
   * @param handler receives one callback per reported token.
   */
  private static void parsePropertyValue(
      CssTokens.TokenIterator it, PropertyHandler handler) {
    while (it.hasNext()) {
      CssTokens.TokenType kind = it.type();
      String text = it.token();

      if (kind == CssTokens.TokenType.SEMICOLON) {
        // End of this value; step past the terminator.
        it.advance();
        return;
      }

      if (kind == CssTokens.TokenType.FUNCTION) {
        CssTokens.TokenIterator args = it.spliceToEnd();
        handler.startFunction(text);
        parsePropertyValue(args, handler);
        handler.endFunction(text);
        // No advance here: presumably spliceToEnd has already moved the
        // iterator past the function -- confirm against CssTokens.
        continue;
      }

      reportValueToken(kind, text, handler);
      it.advance();
    }
  }

  /**
   * Forwards a single non-function, non-semicolon value token to the handler
   * callback that corresponds to its type.  Token types with no callback are
   * dropped.
   */
  private static void reportValueToken(
      CssTokens.TokenType kind, String text, PropertyHandler handler) {
    switch (kind) {
      case IDENT:
        handler.identifier(text);
        return;
      case HASH_UNRESTRICTED: {
        // Only 4- and 7-character hashes (e.g. #rgb, #rrggbb) are reported.
        int length = text.length();
        if (length == 4 || length == 7) {
          handler.hash(text);
        }
        return;
      }
      case STRING:
        handler.quotedString(text);
        return;
      case URL:
        handler.url(text);
        return;
      case DIMENSION:
      case NUMBER:
      case PERCENTAGE:
        handler.quantity(text);
        return;
      case LEFT_CURLY:
      case LEFT_PAREN:
      case LEFT_SQUARE:
      case RIGHT_CURLY:
      case RIGHT_PAREN:
      case RIGHT_SQUARE:
      case COMMA:
      case COLON:
      case DELIM:
        handler.punctuation(text);
        return;
      case AT:
      case BAD_DIMENSION:
      case COLUMN:
      case DOT_IDENT:
      case HASH_ID:
      case MATCH:
      case UNICODE_RANGE:
      case WHITESPACE:
      default:
        // Nothing is reported for these token types.
        return;
    }
  }

  /**
   * Decodes any escape sequences and strips any quotes from the input.
   *
   * <p>A token of at least two characters that starts and ends with the same
   * quote character ({@code "} or {@code '}) has that pair of quotes removed.
   * Within the remaining content, a backslash followed by hex digits decodes
   * to the code point the digits name, and a single space or tab directly
   * after the digits is dropped; a backslash followed by any other character
   * decodes to that character.
   *
   * <p>Malformed escapes decode to U+FFFD rather than throwing.  This covers
   * a backslash with nothing after it inside the content, hex digits whose
   * value does not fit in an {@code int}, and hex digits that name a value
   * which is not a valid Unicode code point.
   *
   * @param token the raw text of a CSS token; must not be null.
   * @return the decoded content, or {@code token} itself when there were no
   *     quotes to strip and no escapes to decode.
   */
  static String cssContent(String token) {
    final int replacement = 0xfffd;  // Stands in for undecodable escapes.

    int n = token.length();  // Exclusive end of the content region.
    int pos = 0;             // Start of the not-yet-copied content.
    StringBuilder sb = null;
    if (n >= 2) {
      char ch0 = token.charAt(0);
      if ((ch0 == '"' || ch0 == '\'') && ch0 == token.charAt(n - 1)) {
        pos = 1;
        --n;
        sb = new StringBuilder(n);
      }
    }

    for (int esc; (esc = token.indexOf('\\', pos)) >= 0 && esc < n;) {
      if (sb == null) { sb = new StringBuilder(n); }
      sb.append(token, pos, esc);

      int end = esc + 2;
      int codepoint;
      if (end > n) {
        // The backslash is the last character of the content, so there is
        // nothing to escape.  Reading past it would either run off the end
        // of the token or swallow the closing quote.
        codepoint = replacement;
        end = n;
      } else {
        codepoint = token.charAt(esc + 1);
        if (isHex(codepoint)) {
          // Parse \hhhhh<opt-break> where hhhhh is one or more hex digits
          // and <opt-break> is an optional space or tab character that can
          // be used to separate an escape sequence from a following literal
          // hex digit.
          while (end < n && isHex(token.charAt(end))) { ++end; }
          try {
            codepoint = Integer.parseInt(token.substring(esc + 1, end), 16);
          } catch (NumberFormatException ex) {
            codepoint = replacement;  // Too many digits to fit in an int.
          }
          if (!Character.isValidCodePoint(codepoint)) {
            // appendCodePoint rejects values above U+10FFFF.
            codepoint = replacement;
          }
          if (end < n) {
            char ch = token.charAt(end);
            if (ch == ' ' || ch == '\t') {  // Ignorable hex follower.
              ++end;
            }
          }
        }
      }
      sb.appendCodePoint(codepoint);
      pos = end;
    }

    if (sb == null) { return token; }
    return sb.append(token, pos, n).toString();
  }

  /** True iff {@code codepoint} is an ASCII hex digit: 0-9, A-F, or a-f. */
  private static boolean isHex(int codepoint) {
    return ('0' <= codepoint && codepoint <= '9')
        || ('A' <= codepoint && codepoint <= 'F')
        || ('a' <= codepoint && codepoint <= 'f');
  }

  /**
   * Receives the events produced while parsing a CSS property group.
   * For each property, {@code parsePropertyGroup} calls
   * {@link #startProperty}, then the callbacks for the tokens of the value,
   * then {@link #endProperty}.
   */
  interface PropertyHandler {
    /**
     * Called when a property name followed by a colon has been recognized.
     *
     * @param propertyName the property name, lower-cased by the parser.
     */
    void startProperty(String propertyName);
    /** Called for a DIMENSION, NUMBER, or PERCENTAGE token in a value. */
    void quantity(String token);
    /** Called for an IDENT token in a value. */
    void identifier(String token);
    /**
     * Called for a HASH_UNRESTRICTED token in a value, but only when the
     * token text is 4 or 7 characters long.
     */
    void hash(String token);
    /** Called for a STRING token in a value, with the raw token text. */
    void quotedString(String token);
    /** Called for a URL token in a value, with the raw token text. */
    void url(String token);
    /**
     * Called for an open or close curly, paren, or square bracket token,
     * and for COMMA, COLON, and DELIM tokens in a value.
     */
    void punctuation(String token);
    /**
     * Called for a FUNCTION token in a value, before the callbacks for the
     * function's contents.
     *
     * @param token the text of the FUNCTION token.
     */
    void startFunction(String token);
    /**
     * Called after the callbacks for a function's contents.
     *
     * @param token the same text that was passed to the matching
     *     {@link #startFunction} call.
     */
    void endFunction(String token);
    /** Called after the tokens of a property's value have been reported. */
    void endProperty();
  }

}