1/*
2 * Copyright 2005 Maksim Orlovich <maksim@kde.org>
3 * Copyright (C) 2006 Apple Computer, Inc.
4 * Copyright (C) 2007 Alexey Proskuryakov <ap@webkit.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "XPathParser.h"
30
31#if ENABLE(XPATH)
32
33#include "ExceptionCode.h"
34#include "StringHash.h"
35#include "XPathEvaluator.h"
36#include "XPathException.h"
37#include "XPathNSResolver.h"
38#include "XPathStep.h"
39#include <wtf/StdLibExtras.h>
40
41int xpathyyparse(void*);
42
43using namespace WTF;
44using namespace Unicode;
45
46namespace WebCore {
47namespace XPath {
48
49class LocationPath;
50
51#include "XPathGrammar.h"
52
53Parser* Parser::currentParser = 0;
54
55enum XMLCat { NameStart, NameCont, NotPartOfName };
56
57typedef HashMap<String, Step::Axis> AxisNamesMap;
58
59static XMLCat charCat(UChar aChar)
60{
61    //### might need to add some special cases from the XML spec.
62
63    if (aChar == '_')
64        return NameStart;
65
66    if (aChar == '.' || aChar == '-')
67        return NameCont;
68    CharCategory category = Unicode::category(aChar);
69    if (category & (Letter_Uppercase | Letter_Lowercase | Letter_Other | Letter_Titlecase | Number_Letter))
70        return NameStart;
71    if (category & (Mark_NonSpacing | Mark_SpacingCombining | Mark_Enclosing | Letter_Modifier | Number_DecimalDigit))
72        return NameCont;
73    return NotPartOfName;
74}
75
76static void setUpAxisNamesMap(AxisNamesMap& axisNames)
77{
78    struct AxisName {
79        const char* name;
80        Step::Axis axis;
81    };
82    const AxisName axisNameList[] = {
83        { "ancestor", Step::AncestorAxis },
84        { "ancestor-or-self", Step::AncestorOrSelfAxis },
85        { "attribute", Step::AttributeAxis },
86        { "child", Step::ChildAxis },
87        { "descendant", Step::DescendantAxis },
88        { "descendant-or-self", Step::DescendantOrSelfAxis },
89        { "following", Step::FollowingAxis },
90        { "following-sibling", Step::FollowingSiblingAxis },
91        { "namespace", Step::NamespaceAxis },
92        { "parent", Step::ParentAxis },
93        { "preceding", Step::PrecedingAxis },
94        { "preceding-sibling", Step::PrecedingSiblingAxis },
95        { "self", Step::SelfAxis }
96    };
97    for (unsigned i = 0; i < sizeof(axisNameList) / sizeof(axisNameList[0]); ++i)
98        axisNames.set(axisNameList[i].name, axisNameList[i].axis);
99}
100
101static bool isAxisName(const String& name, Step::Axis& type)
102{
103    DEFINE_STATIC_LOCAL(AxisNamesMap, axisNames, ());
104
105    if (axisNames.isEmpty())
106        setUpAxisNamesMap(axisNames);
107
108    AxisNamesMap::iterator it = axisNames.find(name);
109    if (it == axisNames.end())
110        return false;
111    type = it->second;
112    return true;
113}
114
115static bool isNodeTypeName(const String& name)
116{
117    DEFINE_STATIC_LOCAL(HashSet<String>, nodeTypeNames, ());
118    if (nodeTypeNames.isEmpty()) {
119        nodeTypeNames.add("comment");
120        nodeTypeNames.add("text");
121        nodeTypeNames.add("processing-instruction");
122        nodeTypeNames.add("node");
123    }
124    return nodeTypeNames.contains(name);
125}
126
127/* Returns whether the last parsed token matches the [32] Operator rule
128 * (check http://www.w3.org/TR/xpath#exprlex). Necessary to disambiguate
129 * the tokens.
130 */
131bool Parser::isOperatorContext() const
132{
133    if (m_nextPos == 0)
134        return false;
135
136    switch (m_lastTokenType) {
137        case AND: case OR: case MULOP:
138        case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
139        case EQOP: case RELOP:
140        case '@': case AXISNAME:   case '(': case '[':
141            return false;
142        default:
143            return true;
144    }
145}
146
147void Parser::skipWS()
148{
149    while (m_nextPos < m_data.length() && isSpaceOrNewline(m_data[m_nextPos]))
150        ++m_nextPos;
151}
152
153Token Parser::makeTokenAndAdvance(int code, int advance)
154{
155    m_nextPos += advance;
156    return Token(code);
157}
158
159Token Parser::makeTokenAndAdvance(int code, NumericOp::Opcode val, int advance)
160{
161    m_nextPos += advance;
162    return Token(code, val);
163}
164
165Token Parser::makeTokenAndAdvance(int code, EqTestOp::Opcode val, int advance)
166{
167    m_nextPos += advance;
168    return Token(code, val);
169}
170
171// Returns next char if it's there and interesting, 0 otherwise
172char Parser::peekAheadHelper()
173{
174    if (m_nextPos + 1 >= m_data.length())
175        return 0;
176    UChar next = m_data[m_nextPos + 1];
177    if (next >= 0xff)
178        return 0;
179    return next;
180}
181
182char Parser::peekCurHelper()
183{
184    if (m_nextPos >= m_data.length())
185        return 0;
186    UChar next = m_data[m_nextPos];
187    if (next >= 0xff)
188        return 0;
189    return next;
190}
191
192Token Parser::lexString()
193{
194    UChar delimiter = m_data[m_nextPos];
195    int startPos = m_nextPos + 1;
196
197    for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
198        if (m_data[m_nextPos] == delimiter) {
199            String value = m_data.substring(startPos, m_nextPos - startPos);
200            if (value.isNull())
201                value = "";
202            ++m_nextPos; // Consume the char.
203            return Token(LITERAL, value);
204        }
205    }
206
207    // Ouch, went off the end -- report error.
208    return Token(XPATH_ERROR);
209}
210
211Token Parser::lexNumber()
212{
213    int startPos = m_nextPos;
214    bool seenDot = false;
215
216    // Go until end or a non-digits character.
217    for (; m_nextPos < m_data.length(); ++m_nextPos) {
218        UChar aChar = m_data[m_nextPos];
219        if (aChar >= 0xff) break;
220
221        if (aChar < '0' || aChar > '9') {
222            if (aChar == '.' && !seenDot)
223                seenDot = true;
224            else
225                break;
226        }
227    }
228
229    return Token(NUMBER, m_data.substring(startPos, m_nextPos - startPos));
230}
231
232bool Parser::lexNCName(String& name)
233{
234    int startPos = m_nextPos;
235    if (m_nextPos >= m_data.length())
236        return false;
237
238    if (charCat(m_data[m_nextPos]) != NameStart)
239        return false;
240
241    // Keep going until we get a character that's not good for names.
242    for (; m_nextPos < m_data.length(); ++m_nextPos)
243        if (charCat(m_data[m_nextPos]) == NotPartOfName)
244            break;
245
246    name = m_data.substring(startPos, m_nextPos - startPos);
247    return true;
248}
249
250bool Parser::lexQName(String& name)
251{
252    String n1;
253    if (!lexNCName(n1))
254        return false;
255
256    skipWS();
257
258    // If the next character is :, what we just got it the prefix, if not,
259    // it's the whole thing.
260    if (peekAheadHelper() != ':') {
261        name = n1;
262        return true;
263    }
264
265    String n2;
266    if (!lexNCName(n2))
267        return false;
268
269    name = n1 + ":" + n2;
270    return true;
271}
272
273Token Parser::nextTokenInternal()
274{
275    skipWS();
276
277    if (m_nextPos >= m_data.length())
278        return Token(0);
279
280    char code = peekCurHelper();
281    switch (code) {
282        case '(': case ')': case '[': case ']':
283        case '@': case ',': case '|':
284            return makeTokenAndAdvance(code);
285        case '\'':
286        case '\"':
287            return lexString();
288        case '0': case '1': case '2': case '3': case '4':
289        case '5': case '6': case '7': case '8': case '9':
290            return lexNumber();
291        case '.': {
292            char next = peekAheadHelper();
293            if (next == '.')
294                return makeTokenAndAdvance(DOTDOT, 2);
295            if (next >= '0' && next <= '9')
296                return lexNumber();
297            return makeTokenAndAdvance('.');
298        }
299        case '/':
300            if (peekAheadHelper() == '/')
301                return makeTokenAndAdvance(SLASHSLASH, 2);
302            return makeTokenAndAdvance('/');
303        case '+':
304            return makeTokenAndAdvance(PLUS);
305        case '-':
306            return makeTokenAndAdvance(MINUS);
307        case '=':
308            return makeTokenAndAdvance(EQOP, EqTestOp::OP_EQ);
309        case '!':
310            if (peekAheadHelper() == '=')
311                return makeTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2);
312            return Token(XPATH_ERROR);
313        case '<':
314            if (peekAheadHelper() == '=')
315                return makeTokenAndAdvance(RELOP, EqTestOp::OP_LE, 2);
316            return makeTokenAndAdvance(RELOP, EqTestOp::OP_LT);
317        case '>':
318            if (peekAheadHelper() == '=')
319                return makeTokenAndAdvance(RELOP, EqTestOp::OP_GE, 2);
320            return makeTokenAndAdvance(RELOP, EqTestOp::OP_GT);
321        case '*':
322            if (isOperatorContext())
323                return makeTokenAndAdvance(MULOP, NumericOp::OP_Mul);
324            ++m_nextPos;
325            return Token(NAMETEST, "*");
326        case '$': { // $ QName
327            m_nextPos++;
328            String name;
329            if (!lexQName(name))
330                return Token(XPATH_ERROR);
331            return Token(VARIABLEREFERENCE, name);
332        }
333    }
334
335    String name;
336    if (!lexNCName(name))
337        return Token(XPATH_ERROR);
338
339    skipWS();
340    // If we're in an operator context, check for any operator names
341    if (isOperatorContext()) {
342        if (name == "and") //### hash?
343            return Token(AND);
344        if (name == "or")
345            return Token(OR);
346        if (name == "mod")
347            return Token(MULOP, NumericOp::OP_Mod);
348        if (name == "div")
349            return Token(MULOP, NumericOp::OP_Div);
350    }
351
352    // See whether we are at a :
353    if (peekCurHelper() == ':') {
354        m_nextPos++;
355        // Any chance it's an axis name?
356        if (peekCurHelper() == ':') {
357            m_nextPos++;
358
359            //It might be an axis name.
360            Step::Axis axis;
361            if (isAxisName(name, axis))
362                return Token(AXISNAME, axis);
363            // Ugh, :: is only valid in axis names -> error
364            return Token(XPATH_ERROR);
365        }
366
367        // Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
368        skipWS();
369        if (peekCurHelper() == '*') {
370            m_nextPos++;
371            return Token(NAMETEST, name + ":*");
372        }
373
374        // Make a full qname.
375        String n2;
376        if (!lexNCName(n2))
377            return Token(XPATH_ERROR);
378
379        name = name + ":" + n2;
380    }
381
382    skipWS();
383    if (peekCurHelper() == '(') {
384        //note: we don't swallow the (here!
385
386        //either node type of function name
387        if (isNodeTypeName(name)) {
388            if (name == "processing-instruction")
389                return Token(PI, name);
390
391            return Token(NODETYPE, name);
392        }
393        //must be a function name.
394        return Token(FUNCTIONNAME, name);
395    }
396
397    // At this point, it must be NAMETEST.
398    return Token(NAMETEST, name);
399}
400
401Token Parser::nextToken()
402{
403    Token toRet = nextTokenInternal();
404    m_lastTokenType = toRet.type;
405    return toRet;
406}
407
408Parser::Parser()
409{
410    reset(String());
411}
412
413void Parser::reset(const String& data)
414{
415    m_nextPos = 0;
416    m_data = data;
417    m_lastTokenType = 0;
418
419    m_topExpr = 0;
420    m_gotNamespaceError = false;
421}
422
423int Parser::lex(void* data)
424{
425    YYSTYPE* yylval = static_cast<YYSTYPE*>(data);
426    Token tok = nextToken();
427
428    switch (tok.type) {
429        case AXISNAME:
430            yylval->axis = tok.axis;
431            break;
432        case MULOP:
433            yylval->numop = tok.numop;
434            break;
435        case RELOP:
436        case EQOP:
437            yylval->eqop = tok.eqop;
438            break;
439        case NODETYPE:
440        case PI:
441        case FUNCTIONNAME:
442        case LITERAL:
443        case VARIABLEREFERENCE:
444        case NUMBER:
445        case NAMETEST:
446            yylval->str = new String(tok.str);
447            registerString(yylval->str);
448            break;
449    }
450
451    return tok.type;
452}
453
454bool Parser::expandQName(const String& qName, String& localName, String& namespaceURI)
455{
456    int colon = qName.find(':');
457    if (colon >= 0) {
458        if (!m_resolver)
459            return false;
460        namespaceURI = m_resolver->lookupNamespaceURI(qName.left(colon));
461        if (namespaceURI.isNull())
462            return false;
463        localName = qName.substring(colon + 1);
464    } else
465        localName = qName;
466
467    return true;
468}
469
470Expression* Parser::parseStatement(const String& statement, PassRefPtr<XPathNSResolver> resolver, ExceptionCode& ec)
471{
472    reset(statement);
473
474    m_resolver = resolver;
475
476    Parser* oldParser = currentParser;
477    currentParser = this;
478    int parseError = xpathyyparse(this);
479    currentParser = oldParser;
480
481    if (parseError) {
482        deleteAllValues(m_parseNodes);
483        m_parseNodes.clear();
484
485        HashSet<Vector<Predicate*>*>::iterator pend = m_predicateVectors.end();
486        for (HashSet<Vector<Predicate*>*>::iterator it = m_predicateVectors.begin(); it != pend; ++it) {
487            deleteAllValues(**it);
488            delete *it;
489        }
490        m_predicateVectors.clear();
491
492        HashSet<Vector<Expression*>*>::iterator eend = m_expressionVectors.end();
493        for (HashSet<Vector<Expression*>*>::iterator it = m_expressionVectors.begin(); it != eend; ++it) {
494            deleteAllValues(**it);
495            delete *it;
496        }
497        m_expressionVectors.clear();
498
499        deleteAllValues(m_strings);
500        m_strings.clear();
501
502        deleteAllValues(m_nodeTests);
503        m_nodeTests.clear();
504
505        m_topExpr = 0;
506
507        if (m_gotNamespaceError)
508            ec = NAMESPACE_ERR;
509        else
510            ec = XPathException::INVALID_EXPRESSION_ERR;
511        return 0;
512    }
513
514    ASSERT(m_parseNodes.size() == 1);
515    ASSERT(*m_parseNodes.begin() == m_topExpr);
516    ASSERT(m_expressionVectors.size() == 0);
517    ASSERT(m_predicateVectors.size() == 0);
518    ASSERT(m_strings.size() == 0);
519    ASSERT(m_nodeTests.size() == 0);
520
521    m_parseNodes.clear();
522    Expression* result = m_topExpr;
523    m_topExpr = 0;
524
525    return result;
526}
527
528void Parser::registerParseNode(ParseNode* node)
529{
530    if (node == 0)
531        return;
532
533    ASSERT(!m_parseNodes.contains(node));
534
535    m_parseNodes.add(node);
536}
537
538void Parser::unregisterParseNode(ParseNode* node)
539{
540    if (node == 0)
541        return;
542
543    ASSERT(m_parseNodes.contains(node));
544
545    m_parseNodes.remove(node);
546}
547
548void Parser::registerPredicateVector(Vector<Predicate*>* vector)
549{
550    if (vector == 0)
551        return;
552
553    ASSERT(!m_predicateVectors.contains(vector));
554
555    m_predicateVectors.add(vector);
556}
557
558void Parser::deletePredicateVector(Vector<Predicate*>* vector)
559{
560    if (vector == 0)
561        return;
562
563    ASSERT(m_predicateVectors.contains(vector));
564
565    m_predicateVectors.remove(vector);
566    delete vector;
567}
568
569
570void Parser::registerExpressionVector(Vector<Expression*>* vector)
571{
572    if (vector == 0)
573        return;
574
575    ASSERT(!m_expressionVectors.contains(vector));
576
577    m_expressionVectors.add(vector);
578}
579
580void Parser::deleteExpressionVector(Vector<Expression*>* vector)
581{
582    if (vector == 0)
583        return;
584
585    ASSERT(m_expressionVectors.contains(vector));
586
587    m_expressionVectors.remove(vector);
588    delete vector;
589}
590
591void Parser::registerString(String* s)
592{
593    if (s == 0)
594        return;
595
596    ASSERT(!m_strings.contains(s));
597
598    m_strings.add(s);
599}
600
601void Parser::deleteString(String* s)
602{
603    if (s == 0)
604        return;
605
606    ASSERT(m_strings.contains(s));
607
608    m_strings.remove(s);
609    delete s;
610}
611
612void Parser::registerNodeTest(Step::NodeTest* t)
613{
614    if (t == 0)
615        return;
616
617    ASSERT(!m_nodeTests.contains(t));
618
619    m_nodeTests.add(t);
620}
621
622void Parser::deleteNodeTest(Step::NodeTest* t)
623{
624    if (t == 0)
625        return;
626
627    ASSERT(m_nodeTests.contains(t));
628
629    m_nodeTests.remove(t);
630    delete t;
631}
632
633}
634}
635
636#endif // ENABLE(XPATH)
637