1/* 2 * Copyright 2005 Maksim Orlovich <maksim@kde.org> 3 * Copyright (C) 2006 Apple Computer, Inc. 4 * Copyright (C) 2007 Alexey Proskuryakov <ap@webkit.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28#include "config.h" 29#include "XPathParser.h" 30 31#if ENABLE(XPATH) 32 33#include "ExceptionCode.h" 34#include "XPathEvaluator.h" 35#include "XPathException.h" 36#include "XPathNSResolver.h" 37#include "XPathPath.h" 38#include "XPathStep.h" 39#include <wtf/StdLibExtras.h> 40#include <wtf/text/StringHash.h> 41 42using namespace WebCore; 43using namespace WTF; 44using namespace Unicode; 45using namespace XPath; 46 47extern int xpathyyparse(WebCore::XPath::Parser*); 48#include "XPathGrammar.h" 49 50Parser* Parser::currentParser = 0; 51 52enum XMLCat { NameStart, NameCont, NotPartOfName }; 53 54typedef HashMap<String, Step::Axis> AxisNamesMap; 55 56static XMLCat charCat(UChar aChar) 57{ 58 //### might need to add some special cases from the XML spec. 59 60 if (aChar == '_') 61 return NameStart; 62 63 if (aChar == '.' || aChar == '-') 64 return NameCont; 65 CharCategory category = Unicode::category(aChar); 66 if (category & (Letter_Uppercase | Letter_Lowercase | Letter_Other | Letter_Titlecase | Number_Letter)) 67 return NameStart; 68 if (category & (Mark_NonSpacing | Mark_SpacingCombining | Mark_Enclosing | Letter_Modifier | Number_DecimalDigit)) 69 return NameCont; 70 return NotPartOfName; 71} 72 73static void setUpAxisNamesMap(AxisNamesMap& axisNames) 74{ 75 struct AxisName { 76 const char* name; 77 Step::Axis axis; 78 }; 79 const AxisName axisNameList[] = { 80 { "ancestor", Step::AncestorAxis }, 81 { "ancestor-or-self", Step::AncestorOrSelfAxis }, 82 { "attribute", Step::AttributeAxis }, 83 { "child", Step::ChildAxis }, 84 { "descendant", Step::DescendantAxis }, 85 { "descendant-or-self", Step::DescendantOrSelfAxis }, 86 { "following", Step::FollowingAxis }, 87 { "following-sibling", Step::FollowingSiblingAxis }, 88 { "namespace", Step::NamespaceAxis }, 89 { "parent", Step::ParentAxis }, 90 { "preceding", Step::PrecedingAxis }, 91 { "preceding-sibling", Step::PrecedingSiblingAxis }, 92 { "self", Step::SelfAxis } 93 }; 94 for (unsigned i = 0; i < sizeof(axisNameList) / sizeof(axisNameList[0]); ++i) 95 axisNames.set(axisNameList[i].name, axisNameList[i].axis); 96} 97 98static bool isAxisName(const String& name, Step::Axis& type) 99{ 100 DEFINE_STATIC_LOCAL(AxisNamesMap, axisNames, ()); 101 102 if (axisNames.isEmpty()) 103 setUpAxisNamesMap(axisNames); 104 105 AxisNamesMap::iterator it = axisNames.find(name); 106 if (it == axisNames.end()) 107 return false; 108 type = it->second; 109 return true; 110} 111 112static bool isNodeTypeName(const String& name) 113{ 114 DEFINE_STATIC_LOCAL(HashSet<String>, nodeTypeNames, ()); 115 if (nodeTypeNames.isEmpty()) { 116 nodeTypeNames.add("comment"); 117 nodeTypeNames.add("text"); 118 nodeTypeNames.add("processing-instruction"); 119 nodeTypeNames.add("node"); 120 } 121 return nodeTypeNames.contains(name); 122} 123 124// Returns whether the current token can possibly be a binary operator, given 125// the previous token. Necessary to disambiguate some of the operators 126// (* (multiply), div, and, or, mod) in the [32] Operator rule 127// (check http://www.w3.org/TR/xpath#exprlex). 128bool Parser::isBinaryOperatorContext() const 129{ 130 switch (m_lastTokenType) { 131 case 0: 132 case '@': case AXISNAME: case '(': case '[': case ',': 133 case AND: case OR: case MULOP: 134 case '/': case SLASHSLASH: case '|': case PLUS: case MINUS: 135 case EQOP: case RELOP: 136 return false; 137 default: 138 return true; 139 } 140} 141 142void Parser::skipWS() 143{ 144 while (m_nextPos < m_data.length() && isSpaceOrNewline(m_data[m_nextPos])) 145 ++m_nextPos; 146} 147 148Token Parser::makeTokenAndAdvance(int code, int advance) 149{ 150 m_nextPos += advance; 151 return Token(code); 152} 153 154Token Parser::makeTokenAndAdvance(int code, NumericOp::Opcode val, int advance) 155{ 156 m_nextPos += advance; 157 return Token(code, val); 158} 159 160Token Parser::makeTokenAndAdvance(int code, EqTestOp::Opcode val, int advance) 161{ 162 m_nextPos += advance; 163 return Token(code, val); 164} 165 166// Returns next char if it's there and interesting, 0 otherwise 167char Parser::peekAheadHelper() 168{ 169 if (m_nextPos + 1 >= m_data.length()) 170 return 0; 171 UChar next = m_data[m_nextPos + 1]; 172 if (next >= 0xff) 173 return 0; 174 return next; 175} 176 177char Parser::peekCurHelper() 178{ 179 if (m_nextPos >= m_data.length()) 180 return 0; 181 UChar next = m_data[m_nextPos]; 182 if (next >= 0xff) 183 return 0; 184 return next; 185} 186 187Token Parser::lexString() 188{ 189 UChar delimiter = m_data[m_nextPos]; 190 int startPos = m_nextPos + 1; 191 192 for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) { 193 if (m_data[m_nextPos] == delimiter) { 194 String value = m_data.substring(startPos, m_nextPos - startPos); 195 if (value.isNull()) 196 value = ""; 197 ++m_nextPos; // Consume the char. 198 return Token(LITERAL, value); 199 } 200 } 201 202 // Ouch, went off the end -- report error. 203 return Token(XPATH_ERROR); 204} 205 206Token Parser::lexNumber() 207{ 208 int startPos = m_nextPos; 209 bool seenDot = false; 210 211 // Go until end or a non-digits character. 212 for (; m_nextPos < m_data.length(); ++m_nextPos) { 213 UChar aChar = m_data[m_nextPos]; 214 if (aChar >= 0xff) break; 215 216 if (aChar < '0' || aChar > '9') { 217 if (aChar == '.' && !seenDot) 218 seenDot = true; 219 else 220 break; 221 } 222 } 223 224 return Token(NUMBER, m_data.substring(startPos, m_nextPos - startPos)); 225} 226 227bool Parser::lexNCName(String& name) 228{ 229 int startPos = m_nextPos; 230 if (m_nextPos >= m_data.length()) 231 return false; 232 233 if (charCat(m_data[m_nextPos]) != NameStart) 234 return false; 235 236 // Keep going until we get a character that's not good for names. 237 for (; m_nextPos < m_data.length(); ++m_nextPos) 238 if (charCat(m_data[m_nextPos]) == NotPartOfName) 239 break; 240 241 name = m_data.substring(startPos, m_nextPos - startPos); 242 return true; 243} 244 245bool Parser::lexQName(String& name) 246{ 247 String n1; 248 if (!lexNCName(n1)) 249 return false; 250 251 skipWS(); 252 253 // If the next character is :, what we just got it the prefix, if not, 254 // it's the whole thing. 255 if (peekAheadHelper() != ':') { 256 name = n1; 257 return true; 258 } 259 260 String n2; 261 if (!lexNCName(n2)) 262 return false; 263 264 name = n1 + ":" + n2; 265 return true; 266} 267 268Token Parser::nextTokenInternal() 269{ 270 skipWS(); 271 272 if (m_nextPos >= m_data.length()) 273 return Token(0); 274 275 char code = peekCurHelper(); 276 switch (code) { 277 case '(': case ')': case '[': case ']': 278 case '@': case ',': case '|': 279 return makeTokenAndAdvance(code); 280 case '\'': 281 case '\"': 282 return lexString(); 283 case '0': case '1': case '2': case '3': case '4': 284 case '5': case '6': case '7': case '8': case '9': 285 return lexNumber(); 286 case '.': { 287 char next = peekAheadHelper(); 288 if (next == '.') 289 return makeTokenAndAdvance(DOTDOT, 2); 290 if (next >= '0' && next <= '9') 291 return lexNumber(); 292 return makeTokenAndAdvance('.'); 293 } 294 case '/': 295 if (peekAheadHelper() == '/') 296 return makeTokenAndAdvance(SLASHSLASH, 2); 297 return makeTokenAndAdvance('/'); 298 case '+': 299 return makeTokenAndAdvance(PLUS); 300 case '-': 301 return makeTokenAndAdvance(MINUS); 302 case '=': 303 return makeTokenAndAdvance(EQOP, EqTestOp::OP_EQ); 304 case '!': 305 if (peekAheadHelper() == '=') 306 return makeTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2); 307 return Token(XPATH_ERROR); 308 case '<': 309 if (peekAheadHelper() == '=') 310 return makeTokenAndAdvance(RELOP, EqTestOp::OP_LE, 2); 311 return makeTokenAndAdvance(RELOP, EqTestOp::OP_LT); 312 case '>': 313 if (peekAheadHelper() == '=') 314 return makeTokenAndAdvance(RELOP, EqTestOp::OP_GE, 2); 315 return makeTokenAndAdvance(RELOP, EqTestOp::OP_GT); 316 case '*': 317 if (isBinaryOperatorContext()) 318 return makeTokenAndAdvance(MULOP, NumericOp::OP_Mul); 319 ++m_nextPos; 320 return Token(NAMETEST, "*"); 321 case '$': { // $ QName 322 m_nextPos++; 323 String name; 324 if (!lexQName(name)) 325 return Token(XPATH_ERROR); 326 return Token(VARIABLEREFERENCE, name); 327 } 328 } 329 330 String name; 331 if (!lexNCName(name)) 332 return Token(XPATH_ERROR); 333 334 skipWS(); 335 // If we're in an operator context, check for any operator names 336 if (isBinaryOperatorContext()) { 337 if (name == "and") //### hash? 338 return Token(AND); 339 if (name == "or") 340 return Token(OR); 341 if (name == "mod") 342 return Token(MULOP, NumericOp::OP_Mod); 343 if (name == "div") 344 return Token(MULOP, NumericOp::OP_Div); 345 } 346 347 // See whether we are at a : 348 if (peekCurHelper() == ':') { 349 m_nextPos++; 350 // Any chance it's an axis name? 351 if (peekCurHelper() == ':') { 352 m_nextPos++; 353 354 //It might be an axis name. 355 Step::Axis axis; 356 if (isAxisName(name, axis)) 357 return Token(AXISNAME, axis); 358 // Ugh, :: is only valid in axis names -> error 359 return Token(XPATH_ERROR); 360 } 361 362 // Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest 363 skipWS(); 364 if (peekCurHelper() == '*') { 365 m_nextPos++; 366 return Token(NAMETEST, name + ":*"); 367 } 368 369 // Make a full qname. 370 String n2; 371 if (!lexNCName(n2)) 372 return Token(XPATH_ERROR); 373 374 name = name + ":" + n2; 375 } 376 377 skipWS(); 378 if (peekCurHelper() == '(') { 379 //note: we don't swallow the (here! 380 381 //either node type of function name 382 if (isNodeTypeName(name)) { 383 if (name == "processing-instruction") 384 return Token(PI, name); 385 386 return Token(NODETYPE, name); 387 } 388 //must be a function name. 389 return Token(FUNCTIONNAME, name); 390 } 391 392 // At this point, it must be NAMETEST. 393 return Token(NAMETEST, name); 394} 395 396Token Parser::nextToken() 397{ 398 Token toRet = nextTokenInternal(); 399 m_lastTokenType = toRet.type; 400 return toRet; 401} 402 403Parser::Parser() 404{ 405 reset(String()); 406} 407 408Parser::~Parser() 409{ 410} 411 412void Parser::reset(const String& data) 413{ 414 m_nextPos = 0; 415 m_data = data; 416 m_lastTokenType = 0; 417 418 m_topExpr = 0; 419 m_gotNamespaceError = false; 420} 421 422int Parser::lex(void* data) 423{ 424 YYSTYPE* yylval = static_cast<YYSTYPE*>(data); 425 Token tok = nextToken(); 426 427 switch (tok.type) { 428 case AXISNAME: 429 yylval->axis = tok.axis; 430 break; 431 case MULOP: 432 yylval->numop = tok.numop; 433 break; 434 case RELOP: 435 case EQOP: 436 yylval->eqop = tok.eqop; 437 break; 438 case NODETYPE: 439 case PI: 440 case FUNCTIONNAME: 441 case LITERAL: 442 case VARIABLEREFERENCE: 443 case NUMBER: 444 case NAMETEST: 445 yylval->str = new String(tok.str); 446 registerString(yylval->str); 447 break; 448 } 449 450 return tok.type; 451} 452 453bool Parser::expandQName(const String& qName, String& localName, String& namespaceURI) 454{ 455 size_t colon = qName.find(':'); 456 if (colon != notFound) { 457 if (!m_resolver) 458 return false; 459 namespaceURI = m_resolver->lookupNamespaceURI(qName.left(colon)); 460 if (namespaceURI.isNull()) 461 return false; 462 localName = qName.substring(colon + 1); 463 } else 464 localName = qName; 465 466 return true; 467} 468 469Expression* Parser::parseStatement(const String& statement, PassRefPtr<XPathNSResolver> resolver, ExceptionCode& ec) 470{ 471 reset(statement); 472 473 m_resolver = resolver; 474 475 Parser* oldParser = currentParser; 476 currentParser = this; 477 int parseError = xpathyyparse(this); 478 currentParser = oldParser; 479 480 if (parseError) { 481 deleteAllValues(m_parseNodes); 482 m_parseNodes.clear(); 483 484 HashSet<Vector<Predicate*>*>::iterator pend = m_predicateVectors.end(); 485 for (HashSet<Vector<Predicate*>*>::iterator it = m_predicateVectors.begin(); it != pend; ++it) { 486 deleteAllValues(**it); 487 delete *it; 488 } 489 m_predicateVectors.clear(); 490 491 HashSet<Vector<Expression*>*>::iterator eend = m_expressionVectors.end(); 492 for (HashSet<Vector<Expression*>*>::iterator it = m_expressionVectors.begin(); it != eend; ++it) { 493 deleteAllValues(**it); 494 delete *it; 495 } 496 m_expressionVectors.clear(); 497 498 deleteAllValues(m_strings); 499 m_strings.clear(); 500 501 deleteAllValues(m_nodeTests); 502 m_nodeTests.clear(); 503 504 m_topExpr = 0; 505 506 if (m_gotNamespaceError) 507 ec = NAMESPACE_ERR; 508 else 509 ec = XPathException::INVALID_EXPRESSION_ERR; 510 return 0; 511 } 512 513 ASSERT(m_parseNodes.size() == 1); 514 ASSERT(*m_parseNodes.begin() == m_topExpr); 515 ASSERT(m_expressionVectors.size() == 0); 516 ASSERT(m_predicateVectors.size() == 0); 517 ASSERT(m_strings.size() == 0); 518 ASSERT(m_nodeTests.size() == 0); 519 520 m_parseNodes.clear(); 521 Expression* result = m_topExpr; 522 m_topExpr = 0; 523 524 return result; 525} 526 527void Parser::registerParseNode(ParseNode* node) 528{ 529 if (node == 0) 530 return; 531 532 ASSERT(!m_parseNodes.contains(node)); 533 534 m_parseNodes.add(node); 535} 536 537void Parser::unregisterParseNode(ParseNode* node) 538{ 539 if (node == 0) 540 return; 541 542 ASSERT(m_parseNodes.contains(node)); 543 544 m_parseNodes.remove(node); 545} 546 547void Parser::registerPredicateVector(Vector<Predicate*>* vector) 548{ 549 if (vector == 0) 550 return; 551 552 ASSERT(!m_predicateVectors.contains(vector)); 553 554 m_predicateVectors.add(vector); 555} 556 557void Parser::deletePredicateVector(Vector<Predicate*>* vector) 558{ 559 if (vector == 0) 560 return; 561 562 ASSERT(m_predicateVectors.contains(vector)); 563 564 m_predicateVectors.remove(vector); 565 delete vector; 566} 567 568 569void Parser::registerExpressionVector(Vector<Expression*>* vector) 570{ 571 if (vector == 0) 572 return; 573 574 ASSERT(!m_expressionVectors.contains(vector)); 575 576 m_expressionVectors.add(vector); 577} 578 579void Parser::deleteExpressionVector(Vector<Expression*>* vector) 580{ 581 if (vector == 0) 582 return; 583 584 ASSERT(m_expressionVectors.contains(vector)); 585 586 m_expressionVectors.remove(vector); 587 delete vector; 588} 589 590void Parser::registerString(String* s) 591{ 592 if (s == 0) 593 return; 594 595 ASSERT(!m_strings.contains(s)); 596 597 m_strings.add(s); 598} 599 600void Parser::deleteString(String* s) 601{ 602 if (s == 0) 603 return; 604 605 ASSERT(m_strings.contains(s)); 606 607 m_strings.remove(s); 608 delete s; 609} 610 611void Parser::registerNodeTest(Step::NodeTest* t) 612{ 613 if (t == 0) 614 return; 615 616 ASSERT(!m_nodeTests.contains(t)); 617 618 m_nodeTests.add(t); 619} 620 621void Parser::deleteNodeTest(Step::NodeTest* t) 622{ 623 if (t == 0) 624 return; 625 626 ASSERT(m_nodeTests.contains(t)); 627 628 m_nodeTests.remove(t); 629 delete t; 630} 631 632#endif // ENABLE(XPATH) 633