1/*
2 * Copyright 2005 Maksim Orlovich <maksim@kde.org>
3 * Copyright (C) 2006 Apple Computer, Inc.
4 * Copyright (C) 2007 Alexey Proskuryakov <ap@webkit.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "core/xml/XPathParser.h"
30
31#include "bindings/v8/ExceptionState.h"
32#include "core/XPathGrammar.h"
33#include "core/dom/ExceptionCode.h"
34#include "core/xml/XPathEvaluator.h"
35#include "core/xml/XPathNSResolver.h"
36#include "core/xml/XPathPath.h"
37#include "wtf/StdLibExtras.h"
38#include "wtf/text/StringHash.h"
39
40using namespace WebCore;
41using namespace WTF;
42using namespace Unicode;
43using namespace XPath;
44
45Parser* Parser::currentParser = 0;
46
47enum XMLCat { NameStart, NameCont, NotPartOfName };
48
49typedef HashMap<String, Step::Axis> AxisNamesMap;
50
51static XMLCat charCat(UChar aChar)
52{
53    // might need to add some special cases from the XML spec.
54
55    if (aChar == '_')
56        return NameStart;
57
58    if (aChar == '.' || aChar == '-')
59        return NameCont;
60    CharCategory category = Unicode::category(aChar);
61    if (category & (Letter_Uppercase | Letter_Lowercase | Letter_Other | Letter_Titlecase | Number_Letter))
62        return NameStart;
63    if (category & (Mark_NonSpacing | Mark_SpacingCombining | Mark_Enclosing | Letter_Modifier | Number_DecimalDigit))
64        return NameCont;
65    return NotPartOfName;
66}
67
68static void setUpAxisNamesMap(AxisNamesMap& axisNames)
69{
70    struct AxisName {
71        const char* name;
72        Step::Axis axis;
73    };
74    const AxisName axisNameList[] = {
75        { "ancestor", Step::AncestorAxis },
76        { "ancestor-or-self", Step::AncestorOrSelfAxis },
77        { "attribute", Step::AttributeAxis },
78        { "child", Step::ChildAxis },
79        { "descendant", Step::DescendantAxis },
80        { "descendant-or-self", Step::DescendantOrSelfAxis },
81        { "following", Step::FollowingAxis },
82        { "following-sibling", Step::FollowingSiblingAxis },
83        { "namespace", Step::NamespaceAxis },
84        { "parent", Step::ParentAxis },
85        { "preceding", Step::PrecedingAxis },
86        { "preceding-sibling", Step::PrecedingSiblingAxis },
87        { "self", Step::SelfAxis }
88    };
89    for (unsigned i = 0; i < sizeof(axisNameList) / sizeof(axisNameList[0]); ++i)
90        axisNames.set(axisNameList[i].name, axisNameList[i].axis);
91}
92
93static bool isAxisName(const String& name, Step::Axis& type)
94{
95    DEFINE_STATIC_LOCAL(AxisNamesMap, axisNames, ());
96
97    if (axisNames.isEmpty())
98        setUpAxisNamesMap(axisNames);
99
100    AxisNamesMap::iterator it = axisNames.find(name);
101    if (it == axisNames.end())
102        return false;
103    type = it->value;
104    return true;
105}
106
107static bool isNodeTypeName(const String& name)
108{
109    DEFINE_STATIC_LOCAL(HashSet<String>, nodeTypeNames, ());
110    if (nodeTypeNames.isEmpty()) {
111        nodeTypeNames.add("comment");
112        nodeTypeNames.add("text");
113        nodeTypeNames.add("processing-instruction");
114        nodeTypeNames.add("node");
115    }
116    return nodeTypeNames.contains(name);
117}
118
119// Returns whether the current token can possibly be a binary operator, given
120// the previous token. Necessary to disambiguate some of the operators
121// (* (multiply), div, and, or, mod) in the [32] Operator rule
122// (check http://www.w3.org/TR/xpath#exprlex).
123bool Parser::isBinaryOperatorContext() const
124{
125    switch (m_lastTokenType) {
126    case 0:
127    case '@': case AXISNAME: case '(': case '[': case ',':
128    case AND: case OR: case MULOP:
129    case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
130    case EQOP: case RELOP:
131        return false;
132    default:
133        return true;
134    }
135}
136
137void Parser::skipWS()
138{
139    while (m_nextPos < m_data.length() && isSpaceOrNewline(m_data[m_nextPos]))
140        ++m_nextPos;
141}
142
143Token Parser::makeTokenAndAdvance(int code, int advance)
144{
145    m_nextPos += advance;
146    return Token(code);
147}
148
149Token Parser::makeTokenAndAdvance(int code, NumericOp::Opcode val, int advance)
150{
151    m_nextPos += advance;
152    return Token(code, val);
153}
154
155Token Parser::makeTokenAndAdvance(int code, EqTestOp::Opcode val, int advance)
156{
157    m_nextPos += advance;
158    return Token(code, val);
159}
160
161// Returns next char if it's there and interesting, 0 otherwise
162char Parser::peekAheadHelper()
163{
164    if (m_nextPos + 1 >= m_data.length())
165        return 0;
166    UChar next = m_data[m_nextPos + 1];
167    if (next >= 0xff)
168        return 0;
169    return next;
170}
171
172char Parser::peekCurHelper()
173{
174    if (m_nextPos >= m_data.length())
175        return 0;
176    UChar next = m_data[m_nextPos];
177    if (next >= 0xff)
178        return 0;
179    return next;
180}
181
182Token Parser::lexString()
183{
184    UChar delimiter = m_data[m_nextPos];
185    int startPos = m_nextPos + 1;
186
187    for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
188        if (m_data[m_nextPos] == delimiter) {
189            String value = m_data.substring(startPos, m_nextPos - startPos);
190            if (value.isNull())
191                value = "";
192            ++m_nextPos; // Consume the char.
193            return Token(LITERAL, value);
194        }
195    }
196
197    // Ouch, went off the end -- report error.
198    return Token(XPATH_ERROR);
199}
200
201Token Parser::lexNumber()
202{
203    int startPos = m_nextPos;
204    bool seenDot = false;
205
206    // Go until end or a non-digits character.
207    for (; m_nextPos < m_data.length(); ++m_nextPos) {
208        UChar aChar = m_data[m_nextPos];
209        if (aChar >= 0xff)
210            break;
211
212        if (aChar < '0' || aChar > '9') {
213            if (aChar == '.' && !seenDot)
214                seenDot = true;
215            else
216                break;
217        }
218    }
219
220    return Token(NUMBER, m_data.substring(startPos, m_nextPos - startPos));
221}
222
223bool Parser::lexNCName(String& name)
224{
225    int startPos = m_nextPos;
226    if (m_nextPos >= m_data.length())
227        return false;
228
229    if (charCat(m_data[m_nextPos]) != NameStart)
230        return false;
231
232    // Keep going until we get a character that's not good for names.
233    for (; m_nextPos < m_data.length(); ++m_nextPos) {
234        if (charCat(m_data[m_nextPos]) == NotPartOfName)
235            break;
236    }
237
238    name = m_data.substring(startPos, m_nextPos - startPos);
239    return true;
240}
241
242bool Parser::lexQName(String& name)
243{
244    String n1;
245    if (!lexNCName(n1))
246        return false;
247
248    skipWS();
249
250    // If the next character is :, what we just got it the prefix, if not,
251    // it's the whole thing.
252    if (peekAheadHelper() != ':') {
253        name = n1;
254        return true;
255    }
256
257    String n2;
258    if (!lexNCName(n2))
259        return false;
260
261    name = n1 + ":" + n2;
262    return true;
263}
264
265Token Parser::nextTokenInternal()
266{
267    skipWS();
268
269    if (m_nextPos >= m_data.length())
270        return Token(0);
271
272    char code = peekCurHelper();
273    switch (code) {
274    case '(': case ')': case '[': case ']':
275    case '@': case ',': case '|':
276        return makeTokenAndAdvance(code);
277    case '\'':
278    case '\"':
279        return lexString();
280    case '0': case '1': case '2': case '3': case '4':
281    case '5': case '6': case '7': case '8': case '9':
282        return lexNumber();
283    case '.': {
284        char next = peekAheadHelper();
285        if (next == '.')
286            return makeTokenAndAdvance(DOTDOT, 2);
287        if (next >= '0' && next <= '9')
288            return lexNumber();
289        return makeTokenAndAdvance('.');
290    }
291    case '/':
292        if (peekAheadHelper() == '/')
293            return makeTokenAndAdvance(SLASHSLASH, 2);
294        return makeTokenAndAdvance('/');
295    case '+':
296        return makeTokenAndAdvance(PLUS);
297    case '-':
298        return makeTokenAndAdvance(MINUS);
299    case '=':
300        return makeTokenAndAdvance(EQOP, EqTestOp::OpcodeEqual);
301    case '!':
302        if (peekAheadHelper() == '=')
303            return makeTokenAndAdvance(EQOP, EqTestOp::OpcodeNotEqual, 2);
304        return Token(XPATH_ERROR);
305    case '<':
306        if (peekAheadHelper() == '=')
307            return makeTokenAndAdvance(RELOP, EqTestOp::OpcodeLessOrEqual, 2);
308        return makeTokenAndAdvance(RELOP, EqTestOp::OpcodeLessThan);
309    case '>':
310        if (peekAheadHelper() == '=')
311            return makeTokenAndAdvance(RELOP, EqTestOp::OpcodeGreaterOrEqual, 2);
312        return makeTokenAndAdvance(RELOP, EqTestOp::OpcodeGreaterThan);
313    case '*':
314        if (isBinaryOperatorContext())
315            return makeTokenAndAdvance(MULOP, NumericOp::OP_Mul);
316        ++m_nextPos;
317        return Token(NAMETEST, "*");
318    case '$': { // $ QName
319        m_nextPos++;
320        String name;
321        if (!lexQName(name))
322            return Token(XPATH_ERROR);
323        return Token(VARIABLEREFERENCE, name);
324    }
325    }
326
327    String name;
328    if (!lexNCName(name))
329        return Token(XPATH_ERROR);
330
331    skipWS();
332    // If we're in an operator context, check for any operator names
333    if (isBinaryOperatorContext()) {
334        if (name == "and") // ### hash?
335            return Token(AND);
336        if (name == "or")
337            return Token(OR);
338        if (name == "mod")
339            return Token(MULOP, NumericOp::OP_Mod);
340        if (name == "div")
341            return Token(MULOP, NumericOp::OP_Div);
342    }
343
344    // See whether we are at a :
345    if (peekCurHelper() == ':') {
346        m_nextPos++;
347        // Any chance it's an axis name?
348        if (peekCurHelper() == ':') {
349            m_nextPos++;
350
351            // It might be an axis name.
352            Step::Axis axis;
353            if (isAxisName(name, axis))
354                return Token(AXISNAME, axis);
355            // Ugh, :: is only valid in axis names -> error
356            return Token(XPATH_ERROR);
357        }
358
359        // Seems like this is a fully qualified qname, or perhaps the * modified
360        // one from NameTest
361        skipWS();
362        if (peekCurHelper() == '*') {
363            m_nextPos++;
364            return Token(NAMETEST, name + ":*");
365        }
366
367        // Make a full qname.
368        String n2;
369        if (!lexNCName(n2))
370            return Token(XPATH_ERROR);
371
372        name = name + ":" + n2;
373    }
374
375    skipWS();
376    if (peekCurHelper() == '(') {
377        // Note: we don't swallow the ( here!
378
379        // Either node type of function name
380        if (isNodeTypeName(name)) {
381            if (name == "processing-instruction")
382                return Token(PI, name);
383
384            return Token(NODETYPE, name);
385        }
386        // Must be a function name.
387        return Token(FUNCTIONNAME, name);
388    }
389
390    // At this point, it must be NAMETEST.
391    return Token(NAMETEST, name);
392}
393
394Token Parser::nextToken()
395{
396    Token toRet = nextTokenInternal();
397    m_lastTokenType = toRet.type;
398    return toRet;
399}
400
401Parser::Parser()
402{
403    reset(String());
404}
405
406Parser::~Parser()
407{
408}
409
410void Parser::reset(const String& data)
411{
412    m_nextPos = 0;
413    m_data = data;
414    m_lastTokenType = 0;
415
416    m_topExpr = nullptr;
417    m_gotNamespaceError = false;
418}
419
420int Parser::lex(void* data)
421{
422    YYSTYPE* yylval = static_cast<YYSTYPE*>(data);
423    Token tok = nextToken();
424
425    switch (tok.type) {
426    case AXISNAME:
427        yylval->axis = tok.axis;
428        break;
429    case MULOP:
430        yylval->numop = tok.numop;
431        break;
432    case RELOP:
433    case EQOP:
434        yylval->eqop = tok.eqop;
435        break;
436    case NODETYPE:
437    case PI:
438    case FUNCTIONNAME:
439    case LITERAL:
440    case VARIABLEREFERENCE:
441    case NUMBER:
442    case NAMETEST:
443        yylval->str = new String(tok.str);
444        registerString(yylval->str);
445        break;
446    }
447
448    return tok.type;
449}
450
451bool Parser::expandQName(const String& qName, AtomicString& localName, AtomicString& namespaceURI)
452{
453    size_t colon = qName.find(':');
454    if (colon != kNotFound) {
455        if (!m_resolver)
456            return false;
457        namespaceURI = m_resolver->lookupNamespaceURI(qName.left(colon));
458        if (namespaceURI.isNull())
459            return false;
460        localName = AtomicString(qName.substring(colon + 1));
461    } else {
462        localName = AtomicString(qName);
463    }
464
465    return true;
466}
467
468PassOwnPtrWillBeRawPtr<Expression> Parser::parseStatement(const String& statement, PassRefPtrWillBeRawPtr<XPathNSResolver> resolver, ExceptionState& exceptionState)
469{
470    reset(statement);
471
472    m_resolver = resolver;
473
474    Parser* oldParser = currentParser;
475    currentParser = this;
476    int parseError = xpathyyparse(this);
477    currentParser = oldParser;
478
479    if (parseError) {
480#if !ENABLE(OILPAN)
481        while (!m_parseNodes.isEmpty())
482            delete m_parseNodes.takeAny();
483
484        HashSet<Vector<OwnPtr<Predicate> >*>::iterator pend = m_predicateVectors.end();
485        for (HashSet<Vector<OwnPtr<Predicate> >*>::iterator it = m_predicateVectors.begin(); it != pend; ++it)
486            delete *it;
487        m_predicateVectors.clear();
488
489        HashSet<Vector<OwnPtr<Expression> >*>::iterator eend = m_expressionVectors.end();
490        for (HashSet<Vector<OwnPtr<Expression> >*>::iterator it = m_expressionVectors.begin(); it != eend; ++it)
491            delete *it;
492        m_expressionVectors.clear();
493
494        m_nodeTests.clear();
495#endif
496
497        m_strings.clear();
498
499        m_topExpr = nullptr;
500
501        if (m_gotNamespaceError)
502            exceptionState.throwDOMException(NamespaceError, "The string '" + statement + "' contains unresolvable namespaces.");
503        else
504            exceptionState.throwDOMException(SyntaxError, "The string '" + statement + "' is not a valid XPath expression.");
505        return nullptr;
506    }
507    ASSERT(m_strings.size() == 0);
508#if !ENABLE(OILPAN)
509    ASSERT(m_parseNodes.size() == 1);
510    ASSERT(*m_parseNodes.begin() == m_topExpr);
511    ASSERT(m_expressionVectors.size() == 0);
512    ASSERT(m_predicateVectors.size() == 0);
513    ASSERT(m_nodeTests.size() == 0);
514    m_parseNodes.clear();
515#endif
516
517    Expression* result = m_topExpr;
518    m_topExpr = nullptr;
519
520    return adoptPtrWillBeNoop(result);
521}
522
523void Parser::registerParseNode(ParseNode* node)
524{
525#if !ENABLE(OILPAN)
526    if (node == 0)
527        return;
528
529    ASSERT(!m_parseNodes.contains(node));
530
531    m_parseNodes.add(node);
532#endif
533}
534
535void Parser::unregisterParseNode(ParseNode* node)
536{
537#if !ENABLE(OILPAN)
538    if (node == 0)
539        return;
540
541    ASSERT(m_parseNodes.contains(node));
542
543    m_parseNodes.remove(node);
544#endif
545}
546
547void Parser::registerPredicateVector(WillBeHeapVector<OwnPtrWillBeMember<Predicate> >* vector)
548{
549#if !ENABLE(OILPAN)
550    if (vector == 0)
551        return;
552
553    ASSERT(!m_predicateVectors.contains(vector));
554
555    m_predicateVectors.add(vector);
556#endif
557}
558
559void Parser::deletePredicateVector(WillBeHeapVector<OwnPtrWillBeMember<Predicate> >* vector)
560{
561#if !ENABLE(OILPAN)
562    if (vector == 0)
563        return;
564
565    ASSERT(m_predicateVectors.contains(vector));
566
567    m_predicateVectors.remove(vector);
568    delete vector;
569#endif
570}
571
572
573void Parser::registerExpressionVector(WillBeHeapVector<OwnPtrWillBeMember<Expression> >* vector)
574{
575#if !ENABLE(OILPAN)
576    if (vector == 0)
577        return;
578
579    ASSERT(!m_expressionVectors.contains(vector));
580
581    m_expressionVectors.add(vector);
582#endif
583}
584
585void Parser::deleteExpressionVector(WillBeHeapVector<OwnPtrWillBeMember<Expression> >* vector)
586{
587#if !ENABLE(OILPAN)
588    if (vector == 0)
589        return;
590
591    ASSERT(m_expressionVectors.contains(vector));
592
593    m_expressionVectors.remove(vector);
594    delete vector;
595#endif
596}
597
598void Parser::registerString(String* s)
599{
600    if (s == 0)
601        return;
602
603    ASSERT(!m_strings.contains(s));
604
605    m_strings.add(adoptPtr(s));
606}
607
608void Parser::deleteString(String* s)
609{
610    if (s == 0)
611        return;
612
613    ASSERT(m_strings.contains(s));
614
615    m_strings.remove(s);
616}
617
618void Parser::registerNodeTest(Step::NodeTest* t)
619{
620#if !ENABLE(OILPAN)
621    if (t == 0)
622        return;
623
624    ASSERT(!m_nodeTests.contains(t));
625
626    m_nodeTests.add(adoptPtr(t));
627#endif
628}
629
630void Parser::deleteNodeTest(Step::NodeTest* t)
631{
632#if !ENABLE(OILPAN)
633    if (t == 0)
634        return;
635
636    ASSERT(m_nodeTests.contains(t));
637
638    m_nodeTests.remove(t);
639#endif
640}
641
642