WebCore/xml/XPathParser.cpp

/*
 * Copyright 2005 Maksim Orlovich <maksim@kde.org>
 * Copyright (C) 2006 Apple Computer, Inc.
 * Copyright (C) 2007 Alexey Proskuryakov <ap@webkit.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "XPathParser.h"

#if ENABLE(XPATH)

#include "ExceptionCode.h"
#include "XPathEvaluator.h"
#include "XPathException.h"
#include "XPathNSResolver.h"
#include "XPathPath.h"
#include "XPathStep.h"
#include <wtf/StdLibExtras.h>
#include <wtf/text/StringHash.h>

using namespace WebCore;
using namespace WTF;
using namespace Unicode;
using namespace XPath;

extern int xpathyyparse(WebCore::XPath::Parser*);
#include "XPathGrammar.h"

Parser* Parser::currentParser = 0;

enum XMLCat { NameStart, NameCont, NotPartOfName };

typedef HashMap<String, Step::Axis> AxisNamesMap;

static XMLCat charCat(UChar aChar)
{
    //### might need to add some special cases from the XML spec.

    if (aChar == '_')
        return NameStart;

    if (aChar == '.' || aChar == '-')
        return NameCont;
    CharCategory category = Unicode::category(aChar);
    if (category & (Letter_Uppercase | Letter_Lowercase | Letter_Other | Letter_Titlecase | Number_Letter))
        return NameStart;
    if (category & (Mark_NonSpacing | Mark_SpacingCombining | Mark_Enclosing | Letter_Modifier | Number_DecimalDigit))
        return NameCont;
    return NotPartOfName;
}

static void setUpAxisNamesMap(AxisNamesMap& axisNames)
{
    struct AxisName {
        const char* name;
        Step::Axis axis;
    };
    const AxisName axisNameList[] = {
        { "ancestor", Step::AncestorAxis },
        { "ancestor-or-self", Step::AncestorOrSelfAxis },
        { "attribute", Step::AttributeAxis },
        { "child", Step::ChildAxis },
        { "descendant", Step::DescendantAxis },
        { "descendant-or-self", Step::DescendantOrSelfAxis },
        { "following", Step::FollowingAxis },
        { "following-sibling", Step::FollowingSiblingAxis },
        { "namespace", Step::NamespaceAxis },
        { "parent", Step::ParentAxis },
        { "preceding", Step::PrecedingAxis },
        { "preceding-sibling", Step::PrecedingSiblingAxis },
        { "self", Step::SelfAxis }
    };
    for (unsigned i = 0; i < sizeof(axisNameList) / sizeof(axisNameList[0]); ++i)
        axisNames.set(axisNameList[i].name, axisNameList[i].axis);
}

static bool isAxisName(const String& name, Step::Axis& type)
{
    DEFINE_STATIC_LOCAL(AxisNamesMap, axisNames, ());

    if (axisNames.isEmpty())
        setUpAxisNamesMap(axisNames);

    AxisNamesMap::iterator it = axisNames.find(name);
    if (it == axisNames.end())
        return false;
    type = it->second;
    return true;
}

static bool isNodeTypeName(const String& name)
{
    DEFINE_STATIC_LOCAL(HashSet<String>, nodeTypeNames, ());
    if (nodeTypeNames.isEmpty()) {
        nodeTypeNames.add("comment");
        nodeTypeNames.add("text");
        nodeTypeNames.add("processing-instruction");
        nodeTypeNames.add("node");
    }
    return nodeTypeNames.contains(name);
}

// Returns whether the current token can possibly be a binary operator, given
// the previous token. Necessary to disambiguate some of the operators
// (* (multiply), div, and, or, mod) in the [32] Operator rule
// (check http://www.w3.org/TR/xpath#exprlex).
bool Parser::isBinaryOperatorContext() const
{
    switch (m_lastTokenType) {
    case 0:
    case '@': case AXISNAME: case '(': case '[': case ',':
    case AND: case OR: case MULOP:
    case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
    case EQOP: case RELOP:
        return false;
    default:
        return true;
    }
}

void Parser::skipWS()
{
    while (m_nextPos < m_data.length() && isSpaceOrNewline(m_data[m_nextPos]))
        ++m_nextPos;
}

Token Parser::makeTokenAndAdvance(int code, int advance)
{
    m_nextPos += advance;
    return Token(code);
}

Token Parser::makeTokenAndAdvance(int code, NumericOp::Opcode val, int advance)
{
    m_nextPos += advance;
    return Token(code, val);
}

Token Parser::makeTokenAndAdvance(int code, EqTestOp::Opcode val, int advance)
{
    m_nextPos += advance;
    return Token(code, val);
}

// Returns next char if it's there and interesting, 0 otherwise
char Parser::peekAheadHelper()
{
    if (m_nextPos + 1 >= m_data.length())
        return 0;
    UChar next = m_data[m_nextPos + 1];
    if (next >= 0xff)
        return 0;
    return next;
}

char Parser::peekCurHelper()
{
    if (m_nextPos >= m_data.length())
        return 0;
    UChar next = m_data[m_nextPos];
    if (next >= 0xff)
        return 0;
    return next;
}

Token Parser::lexString()
{
    UChar delimiter = m_data[m_nextPos];
    int startPos = m_nextPos + 1;

    for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
        if (m_data[m_nextPos] == delimiter) {
            String value = m_data.substring(startPos, m_nextPos - startPos);
            if (value.isNull())
                value = "";
            ++m_nextPos; // Consume the char.
            return Token(LITERAL, value);
        }
    }

    // Ouch, went off the end -- report error.
    return Token(XPATH_ERROR);
}

Token Parser::lexNumber()
{
    int startPos = m_nextPos;
    bool seenDot = false;

    // Go until end or a non-digits character.
    for (; m_nextPos < m_data.length(); ++m_nextPos) {
        UChar aChar = m_data[m_nextPos];
        if (aChar >= 0xff) break;

        if (aChar < '0' || aChar > '9') {
            if (aChar == '.' && !seenDot)
                seenDot = true;
            else
                break;
        }
    }

    return Token(NUMBER, m_data.substring(startPos, m_nextPos - startPos));
}

bool Parser::lexNCName(String& name)
{
    int startPos = m_nextPos;
    if (m_nextPos >= m_data.length())
        return false;

    if (charCat(m_data[m_nextPos]) != NameStart)
        return false;

    // Keep going until we get a character that's not good for names.
    for (; m_nextPos < m_data.length(); ++m_nextPos)
        if (charCat(m_data[m_nextPos]) == NotPartOfName)
            break;

    name = m_data.substring(startPos, m_nextPos - startPos);
    return true;
}

bool Parser::lexQName(String& name)
{
    String n1;
    if (!lexNCName(n1))
        return false;

    skipWS();

    // If the next character is :, what we just got it the prefix, if not,
    // it's the whole thing.
    if (peekAheadHelper() != ':') {
        name = n1;
        return true;
    }

    String n2;
    if (!lexNCName(n2))
        return false;

    name = n1 + ":" + n2;
    return true;
}

Token Parser::nextTokenInternal()
{
    skipWS();

    if (m_nextPos >= m_data.length())
        return Token(0);

    char code = peekCurHelper();
    switch (code) {
    case '(': case ')': case '[': case ']':
    case '@': case ',': case '|':
        return makeTokenAndAdvance(code);
    case '\'':
    case '\"':
        return lexString();
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
        return lexNumber();
    case '.': {
        char next = peekAheadHelper();
        if (next == '.')
            return makeTokenAndAdvance(DOTDOT, 2);
        if (next >= '0' && next <= '9')
            return lexNumber();
        return makeTokenAndAdvance('.');
    }
    case '/':
        if (peekAheadHelper() == '/')
            return makeTokenAndAdvance(SLASHSLASH, 2);
        return makeTokenAndAdvance('/');
    case '+':
        return makeTokenAndAdvance(PLUS);
    case '-':
        return makeTokenAndAdvance(MINUS);
    case '=':
        return makeTokenAndAdvance(EQOP, EqTestOp::OP_EQ);
    case '!':
        if (peekAheadHelper() == '=')
            return makeTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2);
        return Token(XPATH_ERROR);
    case '<':
        if (peekAheadHelper() == '=')
            return makeTokenAndAdvance(RELOP, EqTestOp::OP_LE, 2);
        return makeTokenAndAdvance(RELOP, EqTestOp::OP_LT);
    case '>':
        if (peekAheadHelper() == '=')
            return makeTokenAndAdvance(RELOP, EqTestOp::OP_GE, 2);
        return makeTokenAndAdvance(RELOP, EqTestOp::OP_GT);
    case '*':
        if (isBinaryOperatorContext())
            return makeTokenAndAdvance(MULOP, NumericOp::OP_Mul);
        ++m_nextPos;
        return Token(NAMETEST, "*");
    case '$': { // $ QName
        m_nextPos++;
        String name;
        if (!lexQName(name))
            return Token(XPATH_ERROR);
        return Token(VARIABLEREFERENCE, name);
    }
    }

    String name;
    if (!lexNCName(name))
        return Token(XPATH_ERROR);

    skipWS();
    // If we're in an operator context, check for any operator names
    if (isBinaryOperatorContext()) {
        if (name == "and") //### hash?
            return Token(AND);
        if (name == "or")
            return Token(OR);
        if (name == "mod")
            return Token(MULOP, NumericOp::OP_Mod);
        if (name == "div")
            return Token(MULOP, NumericOp::OP_Div);
    }

    // See whether we are at a :
    if (peekCurHelper() == ':') {
        m_nextPos++;
        // Any chance it's an axis name?
        if (peekCurHelper() == ':') {
            m_nextPos++;

            //It might be an axis name.
            Step::Axis axis;
            if (isAxisName(name, axis))
                return Token(AXISNAME, axis);
            // Ugh, :: is only valid in axis names -> error
            return Token(XPATH_ERROR);
        }

        // Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
        skipWS();
        if (peekCurHelper() == '*') {
            m_nextPos++;
            return Token(NAMETEST, name + ":*");
        }

        // Make a full qname.
        String n2;
        if (!lexNCName(n2))
            return Token(XPATH_ERROR);

        name = name + ":" + n2;
    }

    skipWS();
    if (peekCurHelper() == '(') {
        //note: we don't swallow the (here!

        //either node type of function name
        if (isNodeTypeName(name)) {
            if (name == "processing-instruction")
                return Token(PI, name);

            return Token(NODETYPE, name);
        }
        //must be a function name.
        return Token(FUNCTIONNAME, name);
    }

    // At this point, it must be NAMETEST.
    return Token(NAMETEST, name);
}

Token Parser::nextToken()
{
    Token toRet = nextTokenInternal();
    m_lastTokenType = toRet.type;
    return toRet;
}

Parser::Parser()
{
    reset(String());
}

Parser::~Parser()
{
}

void Parser::reset(const String& data)
{
    m_nextPos = 0;
    m_data = data;
    m_lastTokenType = 0;

    m_topExpr = 0;
    m_gotNamespaceError = false;
}

int Parser::lex(void* data)
{
    YYSTYPE* yylval = static_cast<YYSTYPE*>(data);
    Token tok = nextToken();

    switch (tok.type) {
    case AXISNAME:
        yylval->axis = tok.axis;
        break;
    case MULOP:
        yylval->numop = tok.numop;
        break;
    case RELOP:
    case EQOP:
        yylval->eqop = tok.eqop;
        break;
    case NODETYPE:
    case PI:
    case FUNCTIONNAME:
    case LITERAL:
    case VARIABLEREFERENCE:
    case NUMBER:
    case NAMETEST:
        yylval->str = new String(tok.str);
        registerString(yylval->str);
        break;
    }

    return tok.type;
}

bool Parser::expandQName(const String& qName, String& localName, String& namespaceURI)
{
    size_t colon = qName.find(':');
    if (colon != notFound) {
        if (!m_resolver)
            return false;
        namespaceURI = m_resolver->lookupNamespaceURI(qName.left(colon));
        if (namespaceURI.isNull())
            return false;
        localName = qName.substring(colon + 1);
    } else
        localName = qName;

    return true;
}

Expression* Parser::parseStatement(const String& statement, PassRefPtr<XPathNSResolver> resolver, ExceptionCode& ec)
{
    reset(statement);

    m_resolver = resolver;

    Parser* oldParser = currentParser;
    currentParser = this;
    int parseError = xpathyyparse(this);
    currentParser = oldParser;

    if (parseError) {
        deleteAllValues(m_parseNodes);
        m_parseNodes.clear();

        HashSet<Vector<Predicate*>*>::iterator pend = m_predicateVectors.end();
        for (HashSet<Vector<Predicate*>*>::iterator it = m_predicateVectors.begin(); it != pend; ++it) {
            deleteAllValues(**it);
            delete *it;
        }
        m_predicateVectors.clear();

        HashSet<Vector<Expression*>*>::iterator eend = m_expressionVectors.end();
        for (HashSet<Vector<Expression*>*>::iterator it = m_expressionVectors.begin(); it != eend; ++it) {
            deleteAllValues(**it);
            delete *it;
        }
        m_expressionVectors.clear();

        deleteAllValues(m_strings);
        m_strings.clear();

        deleteAllValues(m_nodeTests);
        m_nodeTests.clear();

        m_topExpr = 0;

        if (m_gotNamespaceError)
            ec = NAMESPACE_ERR;
        else
            ec = XPathException::INVALID_EXPRESSION_ERR;
        return 0;
    }

    ASSERT(m_parseNodes.size() == 1);
    ASSERT(*m_parseNodes.begin() == m_topExpr);
    ASSERT(m_expressionVectors.size() == 0);
    ASSERT(m_predicateVectors.size() == 0);
    ASSERT(m_strings.size() == 0);
    ASSERT(m_nodeTests.size() == 0);

    m_parseNodes.clear();
    Expression* result = m_topExpr;
    m_topExpr = 0;

    return result;
}

void Parser::registerParseNode(ParseNode* node)
{
    if (node == 0)
        return;

    ASSERT(!m_parseNodes.contains(node));

    m_parseNodes.add(node);
}

void Parser::unregisterParseNode(ParseNode* node)
{
    if (node == 0)
        return;

    ASSERT(m_parseNodes.contains(node));

    m_parseNodes.remove(node);
}

void Parser::registerPredicateVector(Vector<Predicate*>* vector)
{
    if (vector == 0)
        return;

    ASSERT(!m_predicateVectors.contains(vector));

    m_predicateVectors.add(vector);
}

void Parser::deletePredicateVector(Vector<Predicate*>* vector)
{
    if (vector == 0)
        return;

    ASSERT(m_predicateVectors.contains(vector));

    m_predicateVectors.remove(vector);
    delete vector;
}


void Parser::registerExpressionVector(Vector<Expression*>* vector)
{
    if (vector == 0)
        return;

    ASSERT(!m_expressionVectors.contains(vector));

    m_expressionVectors.add(vector);
}

void Parser::deleteExpressionVector(Vector<Expression*>* vector)
{
    if (vector == 0)
        return;

    ASSERT(m_expressionVectors.contains(vector));

    m_expressionVectors.remove(vector);
    delete vector;
}

void Parser::registerString(String* s)
{
    if (s == 0)
        return;

    ASSERT(!m_strings.contains(s));

    m_strings.add(s);
}

void Parser::deleteString(String* s)
{
    if (s == 0)
        return;

    ASSERT(m_strings.contains(s));

    m_strings.remove(s);
    delete s;
}

void Parser::registerNodeTest(Step::NodeTest* t)
{
    if (t == 0)
        return;

    ASSERT(!m_nodeTests.contains(t));

    m_nodeTests.add(t);
}

void Parser::deleteNodeTest(Step::NodeTest* t)
{
    if (t == 0)
        return;

    ASSERT(m_nodeTests.contains(t));

    m_nodeTests.remove(t);
    delete t;
}

#endif // ENABLE(XPATH)