1/*
2 * Copyright 2005 Maksim Orlovich <maksim@kde.org>
3 * Copyright (C) 2006 Apple Computer, Inc.
4 * Copyright (C) 2007 Alexey Proskuryakov <ap@webkit.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "XPathParser.h"
30
31#if ENABLE(XPATH)
32
33#include "ExceptionCode.h"
34#include "XPathEvaluator.h"
35#include "XPathException.h"
36#include "XPathNSResolver.h"
37#include "XPathPath.h"
38#include "XPathStep.h"
39#include <wtf/StdLibExtras.h>
40#include <wtf/text/StringHash.h>
41
42using namespace WebCore;
43using namespace WTF;
44using namespace Unicode;
45using namespace XPath;
46
47extern int xpathyyparse(WebCore::XPath::Parser*);
48#include "XPathGrammar.h"
49
50Parser* Parser::currentParser = 0;
51
52enum XMLCat { NameStart, NameCont, NotPartOfName };
53
54typedef HashMap<String, Step::Axis> AxisNamesMap;
55
56static XMLCat charCat(UChar aChar)
57{
58    //### might need to add some special cases from the XML spec.
59
60    if (aChar == '_')
61        return NameStart;
62
63    if (aChar == '.' || aChar == '-')
64        return NameCont;
65    CharCategory category = Unicode::category(aChar);
66    if (category & (Letter_Uppercase | Letter_Lowercase | Letter_Other | Letter_Titlecase | Number_Letter))
67        return NameStart;
68    if (category & (Mark_NonSpacing | Mark_SpacingCombining | Mark_Enclosing | Letter_Modifier | Number_DecimalDigit))
69        return NameCont;
70    return NotPartOfName;
71}
72
73static void setUpAxisNamesMap(AxisNamesMap& axisNames)
74{
75    struct AxisName {
76        const char* name;
77        Step::Axis axis;
78    };
79    const AxisName axisNameList[] = {
80        { "ancestor", Step::AncestorAxis },
81        { "ancestor-or-self", Step::AncestorOrSelfAxis },
82        { "attribute", Step::AttributeAxis },
83        { "child", Step::ChildAxis },
84        { "descendant", Step::DescendantAxis },
85        { "descendant-or-self", Step::DescendantOrSelfAxis },
86        { "following", Step::FollowingAxis },
87        { "following-sibling", Step::FollowingSiblingAxis },
88        { "namespace", Step::NamespaceAxis },
89        { "parent", Step::ParentAxis },
90        { "preceding", Step::PrecedingAxis },
91        { "preceding-sibling", Step::PrecedingSiblingAxis },
92        { "self", Step::SelfAxis }
93    };
94    for (unsigned i = 0; i < sizeof(axisNameList) / sizeof(axisNameList[0]); ++i)
95        axisNames.set(axisNameList[i].name, axisNameList[i].axis);
96}
97
98static bool isAxisName(const String& name, Step::Axis& type)
99{
100    DEFINE_STATIC_LOCAL(AxisNamesMap, axisNames, ());
101
102    if (axisNames.isEmpty())
103        setUpAxisNamesMap(axisNames);
104
105    AxisNamesMap::iterator it = axisNames.find(name);
106    if (it == axisNames.end())
107        return false;
108    type = it->second;
109    return true;
110}
111
112static bool isNodeTypeName(const String& name)
113{
114    DEFINE_STATIC_LOCAL(HashSet<String>, nodeTypeNames, ());
115    if (nodeTypeNames.isEmpty()) {
116        nodeTypeNames.add("comment");
117        nodeTypeNames.add("text");
118        nodeTypeNames.add("processing-instruction");
119        nodeTypeNames.add("node");
120    }
121    return nodeTypeNames.contains(name);
122}
123
124// Returns whether the current token can possibly be a binary operator, given
125// the previous token. Necessary to disambiguate some of the operators
126// (* (multiply), div, and, or, mod) in the [32] Operator rule
127// (check http://www.w3.org/TR/xpath#exprlex).
128bool Parser::isBinaryOperatorContext() const
129{
130    switch (m_lastTokenType) {
131    case 0:
132    case '@': case AXISNAME: case '(': case '[': case ',':
133    case AND: case OR: case MULOP:
134    case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
135    case EQOP: case RELOP:
136        return false;
137    default:
138        return true;
139    }
140}
141
142void Parser::skipWS()
143{
144    while (m_nextPos < m_data.length() && isSpaceOrNewline(m_data[m_nextPos]))
145        ++m_nextPos;
146}
147
148Token Parser::makeTokenAndAdvance(int code, int advance)
149{
150    m_nextPos += advance;
151    return Token(code);
152}
153
154Token Parser::makeTokenAndAdvance(int code, NumericOp::Opcode val, int advance)
155{
156    m_nextPos += advance;
157    return Token(code, val);
158}
159
160Token Parser::makeTokenAndAdvance(int code, EqTestOp::Opcode val, int advance)
161{
162    m_nextPos += advance;
163    return Token(code, val);
164}
165
166// Returns next char if it's there and interesting, 0 otherwise
167char Parser::peekAheadHelper()
168{
169    if (m_nextPos + 1 >= m_data.length())
170        return 0;
171    UChar next = m_data[m_nextPos + 1];
172    if (next >= 0xff)
173        return 0;
174    return next;
175}
176
177char Parser::peekCurHelper()
178{
179    if (m_nextPos >= m_data.length())
180        return 0;
181    UChar next = m_data[m_nextPos];
182    if (next >= 0xff)
183        return 0;
184    return next;
185}
186
187Token Parser::lexString()
188{
189    UChar delimiter = m_data[m_nextPos];
190    int startPos = m_nextPos + 1;
191
192    for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
193        if (m_data[m_nextPos] == delimiter) {
194            String value = m_data.substring(startPos, m_nextPos - startPos);
195            if (value.isNull())
196                value = "";
197            ++m_nextPos; // Consume the char.
198            return Token(LITERAL, value);
199        }
200    }
201
202    // Ouch, went off the end -- report error.
203    return Token(XPATH_ERROR);
204}
205
206Token Parser::lexNumber()
207{
208    int startPos = m_nextPos;
209    bool seenDot = false;
210
211    // Go until end or a non-digits character.
212    for (; m_nextPos < m_data.length(); ++m_nextPos) {
213        UChar aChar = m_data[m_nextPos];
214        if (aChar >= 0xff) break;
215
216        if (aChar < '0' || aChar > '9') {
217            if (aChar == '.' && !seenDot)
218                seenDot = true;
219            else
220                break;
221        }
222    }
223
224    return Token(NUMBER, m_data.substring(startPos, m_nextPos - startPos));
225}
226
227bool Parser::lexNCName(String& name)
228{
229    int startPos = m_nextPos;
230    if (m_nextPos >= m_data.length())
231        return false;
232
233    if (charCat(m_data[m_nextPos]) != NameStart)
234        return false;
235
236    // Keep going until we get a character that's not good for names.
237    for (; m_nextPos < m_data.length(); ++m_nextPos)
238        if (charCat(m_data[m_nextPos]) == NotPartOfName)
239            break;
240
241    name = m_data.substring(startPos, m_nextPos - startPos);
242    return true;
243}
244
245bool Parser::lexQName(String& name)
246{
247    String n1;
248    if (!lexNCName(n1))
249        return false;
250
251    skipWS();
252
253    // If the next character is :, what we just got it the prefix, if not,
254    // it's the whole thing.
255    if (peekAheadHelper() != ':') {
256        name = n1;
257        return true;
258    }
259
260    String n2;
261    if (!lexNCName(n2))
262        return false;
263
264    name = n1 + ":" + n2;
265    return true;
266}
267
268Token Parser::nextTokenInternal()
269{
270    skipWS();
271
272    if (m_nextPos >= m_data.length())
273        return Token(0);
274
275    char code = peekCurHelper();
276    switch (code) {
277    case '(': case ')': case '[': case ']':
278    case '@': case ',': case '|':
279        return makeTokenAndAdvance(code);
280    case '\'':
281    case '\"':
282        return lexString();
283    case '0': case '1': case '2': case '3': case '4':
284    case '5': case '6': case '7': case '8': case '9':
285        return lexNumber();
286    case '.': {
287        char next = peekAheadHelper();
288        if (next == '.')
289            return makeTokenAndAdvance(DOTDOT, 2);
290        if (next >= '0' && next <= '9')
291            return lexNumber();
292        return makeTokenAndAdvance('.');
293    }
294    case '/':
295        if (peekAheadHelper() == '/')
296            return makeTokenAndAdvance(SLASHSLASH, 2);
297        return makeTokenAndAdvance('/');
298    case '+':
299        return makeTokenAndAdvance(PLUS);
300    case '-':
301        return makeTokenAndAdvance(MINUS);
302    case '=':
303        return makeTokenAndAdvance(EQOP, EqTestOp::OP_EQ);
304    case '!':
305        if (peekAheadHelper() == '=')
306            return makeTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2);
307        return Token(XPATH_ERROR);
308    case '<':
309        if (peekAheadHelper() == '=')
310            return makeTokenAndAdvance(RELOP, EqTestOp::OP_LE, 2);
311        return makeTokenAndAdvance(RELOP, EqTestOp::OP_LT);
312    case '>':
313        if (peekAheadHelper() == '=')
314            return makeTokenAndAdvance(RELOP, EqTestOp::OP_GE, 2);
315        return makeTokenAndAdvance(RELOP, EqTestOp::OP_GT);
316    case '*':
317        if (isBinaryOperatorContext())
318            return makeTokenAndAdvance(MULOP, NumericOp::OP_Mul);
319        ++m_nextPos;
320        return Token(NAMETEST, "*");
321    case '$': { // $ QName
322        m_nextPos++;
323        String name;
324        if (!lexQName(name))
325            return Token(XPATH_ERROR);
326        return Token(VARIABLEREFERENCE, name);
327    }
328    }
329
330    String name;
331    if (!lexNCName(name))
332        return Token(XPATH_ERROR);
333
334    skipWS();
335    // If we're in an operator context, check for any operator names
336    if (isBinaryOperatorContext()) {
337        if (name == "and") //### hash?
338            return Token(AND);
339        if (name == "or")
340            return Token(OR);
341        if (name == "mod")
342            return Token(MULOP, NumericOp::OP_Mod);
343        if (name == "div")
344            return Token(MULOP, NumericOp::OP_Div);
345    }
346
347    // See whether we are at a :
348    if (peekCurHelper() == ':') {
349        m_nextPos++;
350        // Any chance it's an axis name?
351        if (peekCurHelper() == ':') {
352            m_nextPos++;
353
354            //It might be an axis name.
355            Step::Axis axis;
356            if (isAxisName(name, axis))
357                return Token(AXISNAME, axis);
358            // Ugh, :: is only valid in axis names -> error
359            return Token(XPATH_ERROR);
360        }
361
362        // Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
363        skipWS();
364        if (peekCurHelper() == '*') {
365            m_nextPos++;
366            return Token(NAMETEST, name + ":*");
367        }
368
369        // Make a full qname.
370        String n2;
371        if (!lexNCName(n2))
372            return Token(XPATH_ERROR);
373
374        name = name + ":" + n2;
375    }
376
377    skipWS();
378    if (peekCurHelper() == '(') {
379        //note: we don't swallow the (here!
380
381        //either node type of function name
382        if (isNodeTypeName(name)) {
383            if (name == "processing-instruction")
384                return Token(PI, name);
385
386            return Token(NODETYPE, name);
387        }
388        //must be a function name.
389        return Token(FUNCTIONNAME, name);
390    }
391
392    // At this point, it must be NAMETEST.
393    return Token(NAMETEST, name);
394}
395
396Token Parser::nextToken()
397{
398    Token toRet = nextTokenInternal();
399    m_lastTokenType = toRet.type;
400    return toRet;
401}
402
403Parser::Parser()
404{
405    reset(String());
406}
407
408Parser::~Parser()
409{
410}
411
412void Parser::reset(const String& data)
413{
414    m_nextPos = 0;
415    m_data = data;
416    m_lastTokenType = 0;
417
418    m_topExpr = 0;
419    m_gotNamespaceError = false;
420}
421
422int Parser::lex(void* data)
423{
424    YYSTYPE* yylval = static_cast<YYSTYPE*>(data);
425    Token tok = nextToken();
426
427    switch (tok.type) {
428    case AXISNAME:
429        yylval->axis = tok.axis;
430        break;
431    case MULOP:
432        yylval->numop = tok.numop;
433        break;
434    case RELOP:
435    case EQOP:
436        yylval->eqop = tok.eqop;
437        break;
438    case NODETYPE:
439    case PI:
440    case FUNCTIONNAME:
441    case LITERAL:
442    case VARIABLEREFERENCE:
443    case NUMBER:
444    case NAMETEST:
445        yylval->str = new String(tok.str);
446        registerString(yylval->str);
447        break;
448    }
449
450    return tok.type;
451}
452
453bool Parser::expandQName(const String& qName, String& localName, String& namespaceURI)
454{
455    size_t colon = qName.find(':');
456    if (colon != notFound) {
457        if (!m_resolver)
458            return false;
459        namespaceURI = m_resolver->lookupNamespaceURI(qName.left(colon));
460        if (namespaceURI.isNull())
461            return false;
462        localName = qName.substring(colon + 1);
463    } else
464        localName = qName;
465
466    return true;
467}
468
469Expression* Parser::parseStatement(const String& statement, PassRefPtr<XPathNSResolver> resolver, ExceptionCode& ec)
470{
471    reset(statement);
472
473    m_resolver = resolver;
474
475    Parser* oldParser = currentParser;
476    currentParser = this;
477    int parseError = xpathyyparse(this);
478    currentParser = oldParser;
479
480    if (parseError) {
481        deleteAllValues(m_parseNodes);
482        m_parseNodes.clear();
483
484        HashSet<Vector<Predicate*>*>::iterator pend = m_predicateVectors.end();
485        for (HashSet<Vector<Predicate*>*>::iterator it = m_predicateVectors.begin(); it != pend; ++it) {
486            deleteAllValues(**it);
487            delete *it;
488        }
489        m_predicateVectors.clear();
490
491        HashSet<Vector<Expression*>*>::iterator eend = m_expressionVectors.end();
492        for (HashSet<Vector<Expression*>*>::iterator it = m_expressionVectors.begin(); it != eend; ++it) {
493            deleteAllValues(**it);
494            delete *it;
495        }
496        m_expressionVectors.clear();
497
498        deleteAllValues(m_strings);
499        m_strings.clear();
500
501        deleteAllValues(m_nodeTests);
502        m_nodeTests.clear();
503
504        m_topExpr = 0;
505
506        if (m_gotNamespaceError)
507            ec = NAMESPACE_ERR;
508        else
509            ec = XPathException::INVALID_EXPRESSION_ERR;
510        return 0;
511    }
512
513    ASSERT(m_parseNodes.size() == 1);
514    ASSERT(*m_parseNodes.begin() == m_topExpr);
515    ASSERT(m_expressionVectors.size() == 0);
516    ASSERT(m_predicateVectors.size() == 0);
517    ASSERT(m_strings.size() == 0);
518    ASSERT(m_nodeTests.size() == 0);
519
520    m_parseNodes.clear();
521    Expression* result = m_topExpr;
522    m_topExpr = 0;
523
524    return result;
525}
526
527void Parser::registerParseNode(ParseNode* node)
528{
529    if (node == 0)
530        return;
531
532    ASSERT(!m_parseNodes.contains(node));
533
534    m_parseNodes.add(node);
535}
536
537void Parser::unregisterParseNode(ParseNode* node)
538{
539    if (node == 0)
540        return;
541
542    ASSERT(m_parseNodes.contains(node));
543
544    m_parseNodes.remove(node);
545}
546
547void Parser::registerPredicateVector(Vector<Predicate*>* vector)
548{
549    if (vector == 0)
550        return;
551
552    ASSERT(!m_predicateVectors.contains(vector));
553
554    m_predicateVectors.add(vector);
555}
556
557void Parser::deletePredicateVector(Vector<Predicate*>* vector)
558{
559    if (vector == 0)
560        return;
561
562    ASSERT(m_predicateVectors.contains(vector));
563
564    m_predicateVectors.remove(vector);
565    delete vector;
566}
567
568
569void Parser::registerExpressionVector(Vector<Expression*>* vector)
570{
571    if (vector == 0)
572        return;
573
574    ASSERT(!m_expressionVectors.contains(vector));
575
576    m_expressionVectors.add(vector);
577}
578
579void Parser::deleteExpressionVector(Vector<Expression*>* vector)
580{
581    if (vector == 0)
582        return;
583
584    ASSERT(m_expressionVectors.contains(vector));
585
586    m_expressionVectors.remove(vector);
587    delete vector;
588}
589
590void Parser::registerString(String* s)
591{
592    if (s == 0)
593        return;
594
595    ASSERT(!m_strings.contains(s));
596
597    m_strings.add(s);
598}
599
600void Parser::deleteString(String* s)
601{
602    if (s == 0)
603        return;
604
605    ASSERT(m_strings.contains(s));
606
607    m_strings.remove(s);
608    delete s;
609}
610
611void Parser::registerNodeTest(Step::NodeTest* t)
612{
613    if (t == 0)
614        return;
615
616    ASSERT(!m_nodeTests.contains(t));
617
618    m_nodeTests.add(t);
619}
620
621void Parser::deleteNodeTest(Step::NodeTest* t)
622{
623    if (t == 0)
624        return;
625
626    ASSERT(m_nodeTests.contains(t));
627
628    m_nodeTests.remove(t);
629    delete t;
630}
631
632#endif // ENABLE(XPATH)
633