15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2006 The RE2 Authors. All Rights Reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Regular expression parser. 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The parser is a simple precedence-based parser with a 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// manual stack. The parsing work is done by the methods 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of the ParseState class. The Regexp::Parse function is 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// essentially just a lexer that calls the ParseState method 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// for each token. 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The parser recognizes POSIX extended regular expressions 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// excluding backreferences, collating elements, and collating 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// classes. It also allows the empty string as a regular expression 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W. 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// See regexp.h for rationale. 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/util.h" 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/regexp.h" 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/stringpiece.h" 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/unicode_casefold.h" 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/unicode_groups.h" 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 { 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Regular expression parse state. 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The list of parsed regexps so far is maintained as a vector of 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Regexp pointers called the stack. Left parenthesis and vertical 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// bar markers are also placed on the stack, as Regexps with 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// non-standard opcodes. 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Scanning a left parenthesis causes the parser to push a left parenthesis 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// marker on the stack. 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Scanning a vertical bar causes the parser to pop the stack until it finds a 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// vertical bar or left parenthesis marker (not popping the marker), 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// concatenate all the popped results, and push them back on 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the stack (DoConcatenation). 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Scanning a right parenthesis causes the parser to act as though it 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// has seen a vertical bar, which then leaves the top of the stack in the 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar. 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The parser pops all this off the stack and creates an alternation of the 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// regexps (DoAlternation). 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class Regexp::ParseState { 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ParseState(ParseFlags flags, const StringPiece& whole_regexp, 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status); 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~ParseState(); 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ParseFlags flags() { return flags_; } 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int rune_max() { return rune_max_; } 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Parse methods. All public methods return a bool saying 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // whether parsing should continue. If a method returns 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // false, it has set fields in *status_, and the parser 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // should return NULL. 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pushes the given regular expression onto the stack. 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Could check for too much memory used here. 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool PushRegexp(Regexp* re); 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pushes the literal rune r onto the stack. 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool PushLiteral(Rune r); 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pushes a regexp with the given op (and no args) onto the stack. 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool PushSimpleOp(RegexpOp op); 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pushes a ^ onto the stack. 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool PushCarat(); 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pushes a \b (word == true) or \B (word == false) onto the stack. 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool PushWordBoundary(bool word); 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pushes a $ onto the stack. 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool PushDollar(); 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pushes a . onto the stack 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool PushDot(); 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pushes a repeat operator regexp onto the stack. 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // A valid argument for the operator must already be on the stack. 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // s is the name of the operator, for use in error messages. 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy); 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pushes a repetition regexp onto the stack. 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // A valid argument for the operator must already be on the stack. 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy); 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Checks whether a particular regexp op is a marker. 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool IsMarker(RegexpOp op); 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Processes a left parenthesis in the input. 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pushes a marker onto the stack. 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool DoLeftParen(const StringPiece& name); 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool DoLeftParenNoCapture(); 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Processes a vertical bar in the input. 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool DoVerticalBar(); 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Processes a right parenthesis in the input. 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool DoRightParen(); 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Processes the end of input, returning the final regexp. 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* DoFinish(); 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Finishes the regexp if necessary, preparing it for use 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in a more complicated expression. 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If it is a CharClassBuilder, converts into a CharClass. 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* FinishRegexp(Regexp*); 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // These routines don't manipulate the parse stack 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // directly, but they do need to look at flags_. 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ParseCharClass also manipulates the internals of Regexp 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // while creating *out_re. 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Parse a character class into *out_re. 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Removes parsed text from s. 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool ParseCharClass(StringPiece* s, Regexp** out_re, 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status); 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Parse a character class character into *rp. 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Removes parsed text from s. 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool ParseCCCharacter(StringPiece* s, Rune *rp, 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const StringPiece& whole_class, 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status); 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Parse a character class range into rr. 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Removes parsed text from s. 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool ParseCCRange(StringPiece* s, RuneRange* rr, 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const StringPiece& whole_class, 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status); 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Parse a Perl flag set or non-capturing group from s. 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool ParsePerlFlags(StringPiece* s); 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Finishes the current concatenation, 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // collapsing it into a single regexp on the stack. 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void DoConcatenation(); 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Finishes the current alternation, 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // collapsing it to a single regexp on the stack. 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void DoAlternation(); 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Generalized DoAlternation/DoConcatenation. 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void DoCollapse(RegexpOp op); 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Maybe concatenate Literals into LiteralString. 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool MaybeConcatString(int r, ParseFlags flags); 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)private: 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ParseFlags flags_; 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece whole_regexp_; 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status_; 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* stacktop_; 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int ncap_; // number of capturing parens seen 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int rune_max_; // maximum char value for this encoding 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_EVIL_CONSTRUCTORS(ParseState); 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pseudo-operators - only on parse stack. 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1); 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2); 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp::ParseState::ParseState(ParseFlags flags, 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const StringPiece& whole_regexp, 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status) 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : flags_(flags), whole_regexp_(whole_regexp), 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_(status), stacktop_(NULL), ncap_(0) { 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (flags_ & Latin1) 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rune_max_ = 0xFF; 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rune_max_ = Runemax; 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Cleans up by freeing all the regexps on the stack. 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp::ParseState::~ParseState() { 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* next; 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (Regexp* re = stacktop_; re != NULL; re = next) { 1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) next = re->down_; 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->down_ = NULL; 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->op() == kLeftParen) 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete re->name_; 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Finishes the regexp if necessary, preparing it for use in 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// a more complex expression. 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If it is a CharClassBuilder, converts into a CharClass. 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) { 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re == NULL) 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->down_ = NULL; 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { 1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CharClassBuilder* ccb = re->ccb_; 1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_ = NULL; 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->cc_ = ccb->GetCharClass(); 2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete ccb; 2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re; 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pushes the given regular expression onto the stack. 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Could check for too much memory used here. 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::PushRegexp(Regexp* re) { 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) MaybeConcatString(-1, NoParseFlags); 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Special case: a character class of one character is just 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // a literal. This is a common idiom for escaping 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // single characters (e.g., [.] instead of \.), and some 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // analysis does better with fewer character classes. 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. 2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->op_ == kRegexpCharClass) { 2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->ccb_->size() == 1) { 2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune r = re->ccb_->begin()->lo; 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re = new Regexp(kRegexpLiteral, flags_); 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->rune_ = r; 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (re->ccb_->size() == 2) { 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune r = re->ccb_->begin()->lo; 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) { 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re = new Regexp(kRegexpLiteral, flags_ | FoldCase); 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->rune_ = r + 'a' - 'A'; 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!IsMarker(re->op())) 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->simple_ = re->ComputeSimple(); 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->down_ = stacktop_; 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stacktop_ = re; 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Searches the case folding tables and returns the CaseFold* that contains r. 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r. 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If there isn't one, returns NULL. 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) { 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CaseFold* ef = f + n; 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Binary search for entry containing r. 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (n > 0) { 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int m = n/2; 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (f[m].lo <= r && r <= f[m].hi) 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return &f[m]; 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (r < f[m].lo) { 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = m; 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) f += m+1; 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n -= m+1; 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // There is no entry that contains r, but f points 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // where it would have been. Unless f points at 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the end of the array, it points at the next entry 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // after r. 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (f < ef) 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return f; 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // No entry contains r; no entry contains runes > r. 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns the result of applying the fold f to the rune r. 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Rune ApplyFold(CaseFold *f, Rune r) { 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (f->delta) { 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return r + f->delta; 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case EvenOddSkip: // even <-> odd but only applies to every other 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((r - f->lo) % 2) 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return r; 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // fall through 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case EvenOdd: // even <-> odd 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (r%2 == 0) 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return r + 1; 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return r - 1; 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case OddEvenSkip: // odd <-> even but only applies to every other 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((r - f->lo) % 2) 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return r; 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // fall through 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case OddEven: // odd <-> even 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (r%2 == 1) 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return r + 1; 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return r - 1; 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns the next Rune in r's folding cycle (see unicode_casefold.h). 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Examples: 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// CycleFoldRune('A') = 'a' 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// CycleFoldRune('a') = 'A' 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// CycleFoldRune('K') = 'k' 3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// CycleFoldRune('k') = 0x212A (Kelvin) 3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// CycleFoldRune(0x212A) = 'K' 3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// CycleFoldRune('?') = '?' 3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Rune CycleFoldRune(Rune r) { 3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r); 3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (f == NULL || r < f->lo) 3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return r; 3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ApplyFold(f, r); 3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Add lo-hi to the class, along with their fold-equivalent characters. 3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If lo-hi is already in the class, assume that the fold-equivalent 3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// chars are there too, so there's no work to do. 3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { 3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // AddFoldedRange calls itself recursively for each rune in the fold cycle. 3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Most folding cycles are small: there aren't any bigger than four in the 3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // current Unicode tables. make_unicode_casefold.py checks that 3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the cycles are not too long, and we double-check here using depth. 3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (depth > 10) { 3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "AddFoldedRange recurses too much."; 3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done 3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (lo <= hi) { 3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo); 3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (f == NULL) // lo has no fold, nor does anything above lo 3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo 3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lo = f->lo; 3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add in the result of folding the range lo - f->hi 3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and that range's fold, recursively. 3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune lo1 = lo; 3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune hi1 = min<Rune>(hi, f->hi); 3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (f->delta) { 3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: 3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lo1 += f->delta; 3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) hi1 += f->delta; 3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case EvenOdd: 3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lo1%2 == 1) 3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lo1--; 3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (hi1%2 == 0) 3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) hi1++; 3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case OddEven: 3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lo1%2 == 0) 3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lo1--; 3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (hi1%2 == 1) 3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) hi1++; 3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddFoldedRange(cc, lo1, hi1, depth+1); 3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pick up where this fold left off. 3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lo = f->hi + 1; 3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pushes the literal rune r onto the stack. 3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::PushLiteral(Rune r) { 3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Do case folding if needed. 3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((flags_ & FoldCase) && CycleFoldRune(r) != r) { 3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_ = new CharClassBuilder; 3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune r1 = r; 3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) do { 3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!(flags_ & NeverNL) || r != '\n') { 3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_->AddRange(r, r); 3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r = CycleFoldRune(r); 3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } while (r != r1); 3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_->RemoveAbove(rune_max_); 3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushRegexp(re); 3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Exclude newline if applicable. 3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((flags_ & NeverNL) && r == '\n') 3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushRegexp(new Regexp(kRegexpNoMatch, flags_)); 3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // No fancy stuff worked. Ordinary literal. 3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (MaybeConcatString(r, flags_)) 3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = new Regexp(kRegexpLiteral, flags_); 3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->rune_ = r; 3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushRegexp(re); 3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pushes a ^ onto the stack. 3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::PushCarat() { 3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (flags_ & OneLine) { 4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushSimpleOp(kRegexpBeginText); 4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushSimpleOp(kRegexpBeginLine); 4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pushes a \b or \B onto the stack. 4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::PushWordBoundary(bool word) { 4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (word) 4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushSimpleOp(kRegexpWordBoundary); 4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushSimpleOp(kRegexpNoWordBoundary); 4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pushes a $ onto the stack. 4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::PushDollar() { 4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (flags_ & OneLine) { 4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Clumsy marker so that MimicsPCRE() can tell whether 4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // this kRegexpEndText was a $ and not a \z. 4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags oflags = flags_; 4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flags_ = flags_ | WasDollar; 4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool ret = PushSimpleOp(kRegexpEndText); 4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flags_ = oflags; 4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ret; 4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushSimpleOp(kRegexpEndLine); 4245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pushes a . onto the stack. 4275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::PushDot() { 4285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((flags_ & DotNL) && !(flags_ & NeverNL)) 4295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushSimpleOp(kRegexpAnyChar); 4305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Rewrite . into [^\n] 4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_ = new CharClassBuilder; 4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_->AddRange(0, '\n' - 1); 4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_->AddRange('\n' + 1, rune_max_); 4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushRegexp(re); 4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pushes a regexp with the given op (and no args) onto the stack. 4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::PushSimpleOp(RegexpOp op) { 4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = new Regexp(op, flags_); 4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushRegexp(re); 4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pushes a repeat operator regexp onto the stack. 4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A valid argument for the operator must already be on the stack. 4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The char c is the name of the operator, for use in error messages. 4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, 4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool nongreedy) { 4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (stacktop_ == NULL || IsMarker(stacktop_->op())) { 4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_code(kRegexpRepeatArgument); 4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_error_arg(s); 4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags fl = flags_; 4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (nongreedy) 4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fl = fl ^ NonGreedy; 4575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = new Regexp(op, fl); 4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->AllocSub(1); 4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->down_ = stacktop_->down_; 4605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->sub()[0] = FinishRegexp(stacktop_); 4615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->simple_ = re->ComputeSimple(); 4625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stacktop_ = re; 4635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 4645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pushes a repetition regexp onto the stack. 4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A valid argument for the operator must already be on the stack. 4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::PushRepetition(int min, int max, 4695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const StringPiece& s, 4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool nongreedy) { 4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((max != -1 && max < min) || min > 1000 || max > 1000) { 4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_code(kRegexpRepeatSize); 4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_error_arg(s); 4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (stacktop_ == NULL || IsMarker(stacktop_->op())) { 4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_code(kRegexpRepeatArgument); 4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_error_arg(s); 4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags fl = flags_; 4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (nongreedy) 4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fl = fl ^ NonGreedy; 4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = new Regexp(kRegexpRepeat, fl); 4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->min_ = min; 4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->max_ = max; 4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->AllocSub(1); 4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->down_ = stacktop_->down_; 4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->sub()[0] = FinishRegexp(stacktop_); 4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->simple_ = re->ComputeSimple(); 4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stacktop_ = re; 4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Checks whether a particular regexp op is a marker. 4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::IsMarker(RegexpOp op) { 4985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return op >= kLeftParen; 4995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Processes a left parenthesis in the input. 5025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pushes a marker onto the stack. 5035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::DoLeftParen(const StringPiece& name) { 5045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = new Regexp(kLeftParen, flags_); 5055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->cap_ = ++ncap_; 5065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (name.data() != NULL) 5075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->name_ = new string(name.as_string()); 5085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushRegexp(re); 5095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pushes a non-capturing marker onto the stack. 5125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::DoLeftParenNoCapture() { 5135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = new Regexp(kLeftParen, flags_); 5145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->cap_ = -1; 5155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushRegexp(re); 5165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Adds r to cc, along with r's upper case if foldascii is set. 5195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) { 5205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cc->AddRange(r, r); 5215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (foldascii && 'a' <= r && r <= 'z') 5225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cc->AddRange(r + 'A' - 'a', r + 'A' - 'a'); 5235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Processes a vertical bar in the input. 5265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::DoVerticalBar() { 5275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) MaybeConcatString(-1, NoParseFlags); 5285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DoConcatenation(); 5295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Below the vertical bar is a list to alternate. 5315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Above the vertical bar is a list to concatenate. 5325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We just did the concatenation, so either swap 5335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the result below the vertical bar or push a new 5345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // vertical bar on the stack. 5355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* r1; 5365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* r2; 5375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((r1 = stacktop_) != NULL && 5385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (r2 = stacktop_->down_) != NULL && 5395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r2->op() == kVerticalBar) { 5405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If above and below vertical bar are literal or char class, 5415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // can merge into a single char class. 5425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* r3; 5435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((r1->op() == kRegexpLiteral || 5445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r1->op() == kRegexpCharClass || 5455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r1->op() == kRegexpAnyChar) && 5465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (r3 = r2->down_) != NULL) { 5475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune rune; 5485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (r3->op()) { 5495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpLiteral: // convert to char class 5505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rune = r3->rune_; 5515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r3->op_ = kRegexpCharClass; 5525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r3->cc_ = NULL; 5535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r3->ccb_ = new CharClassBuilder; 5545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase); 5555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // fall through 5565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpCharClass: 5575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (r1->op() == kRegexpLiteral) 5585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddLiteral(r3->ccb_, r1->rune_, 5595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r1->parse_flags_ & Regexp::FoldCase); 5605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else if (r1->op() == kRegexpCharClass) 5615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r3->ccb_->AddCharClass(r1->ccb_); 5625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) { 5635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete r3->ccb_; 5645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r3->ccb_ = NULL; 5655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r3->op_ = kRegexpAnyChar; 5665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // fall through 5685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpAnyChar: 5695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // pop r1 5705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stacktop_ = r2; 5715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r1->Decref(); 5725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 5735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: 5745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 5755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Swap r1 below vertical bar (r2). 5795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r1->down_ = r2->down_; 5805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r2->down_ = r1; 5815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stacktop_ = r2; 5825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 5835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushSimpleOp(kVerticalBar); 5855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Processes a right parenthesis in the input. 5885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::DoRightParen() { 5895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Finish the current concatenation and alternation. 5905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DoAlternation(); 5915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The stack should be: LeftParen regexp 5935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Remove the LeftParen, leaving the regexp, 5945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // parenthesized. 5955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* r1; 5965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* r2; 5975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((r1 = stacktop_) == NULL || 5985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (r2 = r1->down_) == NULL || 5995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r2->op() != kLeftParen) { 6005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_code(kRegexpMissingParen); 6015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_error_arg(whole_regexp_); 6025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 6035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pop off r1, r2. Will Decref or reuse below. 6065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stacktop_ = r2->down_; 6075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Restore flags from when paren opened. 6095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = r2; 6105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flags_ = re->parse_flags(); 6115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Rewrite LeftParen as capture if needed. 6135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->cap_ > 0) { 6145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->op_ = kRegexpCapture; 6155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // re->cap_ is already set 6165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->AllocSub(1); 6175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->sub()[0] = FinishRegexp(r1); 6185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->simple_ = re->ComputeSimple(); 6195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 6205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 6215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re = r1; 6225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return PushRegexp(re); 6245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 6255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Processes the end of input, returning the final regexp. 6275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* Regexp::ParseState::DoFinish() { 6285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DoAlternation(); 6295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = stacktop_; 6305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re != NULL && re->down_ != NULL) { 6315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_code(kRegexpMissingParen); 6325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_error_arg(whole_regexp_); 6335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 6345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stacktop_ = NULL; 6365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return FinishRegexp(re); 6375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 6385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns the leading regexp that re starts with. 6405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The returned Regexp* points into a piece of re, 6415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// so it must not be used after the caller calls re->Decref(). 6425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* Regexp::LeadingRegexp(Regexp* re) { 6435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->op() == kRegexpEmptyMatch) 6445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 6455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->op() == kRegexpConcat && re->nsub() >= 2) { 6465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** sub = re->sub(); 6475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sub[0]->op() == kRegexpEmptyMatch) 6485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 6495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return sub[0]; 6505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re; 6525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 6535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Removes LeadingRegexp(re) from re and returns what's left. 6555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Consumes the reference to re and may edit it in place. 6565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If caller wants to hold on to LeadingRegexp(re), 6575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// must have already Incref'ed it. 6585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) { 6595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->op() == kRegexpEmptyMatch) 6605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re; 6615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->op() == kRegexpConcat && re->nsub() >= 2) { 6625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** sub = re->sub(); 6635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sub[0]->op() == kRegexpEmptyMatch) 6645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re; 6655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[0]->Decref(); 6665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[0] = NULL; 6675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->nsub() == 2) { 6685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Collapse concatenation to single regexp. 6695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* nre = sub[1]; 6705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[1] = NULL; 6715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 6725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return nre; 6735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 3 or more -> 2 or more. 6755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->nsub_--; 6765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); 6775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re; 6785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags pf = re->parse_flags(); 6805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 6815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return new Regexp(kRegexpEmptyMatch, pf); 6825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 6835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns the leading string that re starts with. 6855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The returned Rune* points into a piece of re, 6865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// so it must not be used after the caller calls re->Decref(). 6875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Rune* Regexp::LeadingString(Regexp* re, int *nrune, 6885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags *flags) { 6895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (re->op() == kRegexpConcat && re->nsub() > 0) 6905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re = re->sub()[0]; 6915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase); 6935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->op() == kRegexpLiteral) { 6955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *nrune = 1; 6965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return &re->rune_; 6975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->op() == kRegexpLiteralString) { 7005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *nrune = re->nrunes_; 7015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re->runes_; 7025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *nrune = 0; 7055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 7065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 7075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Removes the first n leading runes from the beginning of re. 7095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Edits re in place. 7105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Regexp::RemoveLeadingString(Regexp* re, int n) { 7115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Chase down concats to find first string. 7125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // For regexps generated by parser, nested concats are 7135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // flattened except when doing so would overflow the 16-bit 7145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // limit on the size of a concatenation, so we should never 7155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // see more than two here. 7165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* stk[4]; 7175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int d = 0; 7185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (re->op() == kRegexpConcat) { 7195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (d < arraysize(stk)) 7205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stk[d++] = re; 7215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re = re->sub()[0]; 7225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Remove leading string from re. 7255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->op() == kRegexpLiteral) { 7265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->rune_ = 0; 7275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->op_ = kRegexpEmptyMatch; 7285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (re->op() == kRegexpLiteralString) { 7295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (n >= re->nrunes_) { 7305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] re->runes_; 7315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->runes_ = NULL; 7325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->nrunes_ = 0; 7335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->op_ = kRegexpEmptyMatch; 7345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (n == re->nrunes_ - 1) { 7355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune rune = re->runes_[re->nrunes_ - 1]; 7365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] re->runes_; 7375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->runes_ = NULL; 7385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->nrunes_ = 0; 7395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->rune_ = rune; 7405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->op_ = kRegexpLiteral; 7415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 7425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->nrunes_ -= n; 7435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]); 7445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If re is now empty, concatenations might simplify too. 7485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (d-- > 0) { 7495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re = stk[d]; 7505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** sub = re->sub(); 7515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sub[0]->op() == kRegexpEmptyMatch) { 7525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[0]->Decref(); 7535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[0] = NULL; 7545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Delete first element of concat. 7555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (re->nsub()) { 7565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 0: 7575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 1: 7585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Impossible. 7595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "Concat of " << re->nsub(); 7605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->submany_ = NULL; 7615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->op_ = kRegexpEmptyMatch; 7625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 7635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 2: { 7655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Replace re with sub[1]. 7665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* old = sub[1]; 7675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[1] = NULL; 7685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Swap(old); 7695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) old->Decref(); 7705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 7715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: 7745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Slide down. 7755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->nsub_--; 7765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); 7775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 7785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 7825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Factors common prefixes from alternation. 7845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For example, 7855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ABC|ABD|AEF|BCX|BCY 7865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// simplifies to 7875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A(B(C|D)|EF)|BC(X|Y) 7885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// which the normal parse state routines will further simplify to 7895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A(B[CD]|EF)|BC[XY] 7905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 7915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Rewrites sub to contain simplified list to alternate and returns 7925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the new length of sub. Adjusts reference counts accordingly 7935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (incoming sub[i] decremented, outgoing sub[i] incremented). 7945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// It's too much of a pain to write this code with an explicit stack, 7965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// so instead we let the caller specify a maximum depth and 7975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// don't simplify beyond that. There are around 15 words of local 7985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// variables and parameters in the frame, so allowing 8 levels 7995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// on a 64-bit machine is still less than a kilobyte of stack and 8005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// probably enough benefit for practical uses. 8015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const int kFactorAlternationMaxDepth = 8; 8025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int Regexp::FactorAlternation( 8045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** sub, int n, 8055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags altflags) { 8065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return FactorAlternationRecursive(sub, n, altflags, 8075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kFactorAlternationMaxDepth); 8085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 8095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int Regexp::FactorAlternationRecursive( 8115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** sub, int n, 8125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags altflags, 8135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int maxdepth) { 8145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (maxdepth <= 0) 8165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return n; 8175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Round 1: Factor out common literal prefixes. 8195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune *rune = NULL; 8205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nrune = 0; 8215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags runeflags = Regexp::NoParseFlags; 8225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int start = 0; 8235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int out = 0; 8245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i <= n; i++) { 8255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Invariant: what was in sub[0:start] has been Decref'ed 8265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and that space has been reused for sub[0:out] (out <= start). 8275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 8285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Invariant: sub[start:i] consists of regexps that all begin 8295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // with the string rune[0:nrune]. 8305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune* rune_i = NULL; 8325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nrune_i = 0; 8335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags; 8345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i < n) { 8355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rune_i = LeadingString(sub[i], &nrune_i, &runeflags_i); 8365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (runeflags_i == runeflags) { 8375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int same = 0; 8385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (same < nrune && same < nrune_i && rune[same] == rune_i[same]) 8395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) same++; 8405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (same > 0) { 8415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Matches at least one rune in current range. Keep going around. 8425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nrune = same; 8435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 8445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Found end of a run with common leading literal string: 8495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // sub[start:i] all begin with rune[0:nrune] but sub[i] 8505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // does not even begin with rune[0]. 8515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 8525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Factor out common string and append factored expression to sub[0:out]. 8535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i == start) { 8545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Nothing to do - first iteration. 8555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (i == start+1) { 8565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Just one: don't bother factoring. 8575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[out++] = sub[start]; 8585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 8595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Construct factored form: prefix(suffix1|suffix2|...) 8605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... 8615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) x[0] = LiteralString(rune, nrune, runeflags); 8625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int j = start; j < i; j++) 8635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RemoveLeadingString(sub[j], nrune); 8645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nn = FactorAlternationRecursive(sub + start, i - start, altflags, 8655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) maxdepth - 1); 8665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) x[1] = AlternateNoFactor(sub + start, nn, altflags); 8675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[out++] = Concat(x, 2, altflags); 8685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Prepare for next round (if there is one). 8715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i < n) { 8725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = i; 8735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rune = rune_i; 8745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nrune = nrune_i; 8755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) runeflags = runeflags_i; 8765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = out; 8795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Round 2: Factor out common complex prefixes, 8815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // just the first piece of each concatenation, 8825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // whatever it is. This is good enough a lot of the time. 8835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = 0; 8845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) out = 0; 8855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* first = NULL; 8865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i <= n; i++) { 8875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Invariant: what was in sub[0:start] has been Decref'ed 8885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and that space has been reused for sub[0:out] (out <= start). 8895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 8905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Invariant: sub[start:i] consists of regexps that all begin with first. 8915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* first_i = NULL; 8935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i < n) { 8945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) first_i = LeadingRegexp(sub[i]); 8955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (first != NULL && Regexp::Equal(first, first_i)) { 8965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 8975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Found end of a run with common leading regexp: 9015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // sub[start:i] all begin with first but sub[i] does not. 9025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 9035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Factor out common regexp and append factored expression to sub[0:out]. 9045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i == start) { 9055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Nothing to do - first iteration. 9065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (i == start+1) { 9075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Just one: don't bother factoring. 9085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[out++] = sub[start]; 9095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 9105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Construct factored form: prefix(suffix1|suffix2|...) 9115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... 9125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) x[0] = first->Incref(); 9135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int j = start; j < i; j++) 9145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[j] = RemoveLeadingRegexp(sub[j]); 9155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nn = FactorAlternationRecursive(sub + start, i - start, altflags, 9165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) maxdepth - 1); 9175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) x[1] = AlternateNoFactor(sub + start, nn, altflags); 9185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[out++] = Concat(x, 2, altflags); 9195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Prepare for next round (if there is one). 9225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i < n) { 9235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = i; 9245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) first = first_i; 9255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = out; 9285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Round 3: Collapse runs of single literals into character classes. 9305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = 0; 9315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) out = 0; 9325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i <= n; i++) { 9335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Invariant: what was in sub[0:start] has been Decref'ed 9345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and that space has been reused for sub[0:out] (out <= start). 9355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 9365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Invariant: sub[start:i] consists of regexps that are either 9375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // literal runes or character classes. 9385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i < n && 9405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (sub[i]->op() == kRegexpLiteral || 9415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[i]->op() == kRegexpCharClass)) 9425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 9435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // sub[i] is not a char or char class; 9455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // emit char class for sub[start:i]... 9465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i == start) { 9475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Nothing to do. 9485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (i == start+1) { 9495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[out++] = sub[start]; 9505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 9515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Make new char class. 9525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CharClassBuilder ccb; 9535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int j = start; j < i; j++) { 9545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = sub[j]; 9555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->op() == kRegexpCharClass) { 9565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CharClass* cc = re->cc(); 9575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it) 9585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ccb.AddRange(it->lo, it->hi); 9595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (re->op() == kRegexpLiteral) { 9605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags()); 9615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 9625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " " 9635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) << re->ToString(); 9645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 9665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[out++] = NewCharClass(ccb.GetCharClass(), altflags); 9685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ... and then emit sub[i]. 9715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i < n) 9725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[out++] = sub[i]; 9735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = i+1; 9745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = out; 9765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Round 4: Collapse runs of empty matches into single empty match. 9785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = 0; 9795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) out = 0; 9805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < n; i++) { 9815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i + 1 < n && 9825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[i]->op() == kRegexpEmptyMatch && 9835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[i+1]->op() == kRegexpEmptyMatch) { 9845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[i]->Decref(); 9855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 9865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub[out++] = sub[i]; 9885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = out; 9905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return n; 9925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 9935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Collapse the regexps on top of the stack, down to the 9955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// first marker, into a new op node (op == kRegexpAlternate 9965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// or op == kRegexpConcat). 9975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Regexp::ParseState::DoCollapse(RegexpOp op) { 9985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Scan backward to marker, counting children of composite. 9995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int n = 0; 10005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* next = NULL; 10015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* sub; 10025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { 10035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) next = sub->down_; 10045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sub->op_ == op) 10055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n += sub->nsub_; 10065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 10075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n++; 10085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 10095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If there's just one child, leave it alone. 10115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (Concat of one thing is that one thing; alternate of one thing is same.) 10125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (stacktop_ != NULL && stacktop_->down_ == next) 10135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 10145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Construct op (alternation or concatenation), flattening op of op. 10165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** subs = new Regexp*[n]; 10175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) next = NULL; 10185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int i = n; 10195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { 10205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) next = sub->down_; 10215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sub->op_ == op) { 10225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** sub_subs = sub->sub(); 10235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int k = sub->nsub_ - 1; k >= 0; k--) 10245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) subs[--i] = sub_subs[k]->Incref(); 10255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub->Decref(); 10265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 10275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) subs[--i] = FinishRegexp(sub); 10285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 10295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 10305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = ConcatOrAlternate(op, subs, n, flags_, true); 10325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] subs; 10335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->simple_ = re->ComputeSimple(); 10345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->down_ = next; 10355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stacktop_ = re; 10365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 10375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Finishes the current concatenation, 10395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// collapsing it into a single regexp on the stack. 10405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Regexp::ParseState::DoConcatenation() { 10415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* r1 = stacktop_; 10425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (r1 == NULL || IsMarker(r1->op())) { 10435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // empty concatenation is special case 10445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = new Regexp(kRegexpEmptyMatch, flags_); 10455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PushRegexp(re); 10465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 10475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DoCollapse(kRegexpConcat); 10485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 10495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Finishes the current alternation, 10515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// collapsing it to a single regexp on the stack. 10525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Regexp::ParseState::DoAlternation() { 10535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DoVerticalBar(); 10545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Now stack top is kVerticalBar. 10555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* r1 = stacktop_; 10565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stacktop_ = r1->down_; 10575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) r1->Decref(); 10585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DoCollapse(kRegexpAlternate); 10595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 10605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Incremental conversion of concatenated literals into strings. 10625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If top two elements on stack are both literal or string, 10635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// collapse into single string. 10645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Don't walk down the stack -- the parser calls this frequently 10655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// enough that below the bottom two is known to be collapsed. 10665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Only called when another regexp is about to be pushed 10675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// on the stack, so that the topmost literal is not being considered. 10685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (Otherwise ab* would turn into (ab)*.) 10695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If r >= 0, consider pushing a literal r on the stack. 10705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return whether that happened. 10715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { 10725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re1; 10735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re2; 10745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL) 10755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 10765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString) 10785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 10795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString) 10805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 10815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase)) 10825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 10835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re2->op_ == kRegexpLiteral) { 10855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // convert into string 10865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune rune = re2->rune_; 10875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re2->op_ = kRegexpLiteralString; 10885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re2->nrunes_ = 0; 10895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re2->runes_ = NULL; 10905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re2->AddRuneToString(rune); 10915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 10925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // push re1 into re2. 10945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re1->op_ == kRegexpLiteral) { 10955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re2->AddRuneToString(re1->rune_); 10965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 10975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < re1->nrunes_; i++) 10985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re2->AddRuneToString(re1->runes_[i]); 10995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re1->nrunes_ = 0; 11005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] re1->runes_; 11015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re1->runes_ = NULL; 11025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // reuse re1 if possible 11055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (r >= 0) { 11065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re1->op_ = kRegexpLiteral; 11075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re1->rune_ = r; 11085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re1->parse_flags_ = flags; 11095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 11105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stacktop_ = re2; 11135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re1->Decref(); 11145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 11155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 11165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Lexing routines. 11185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Parses a decimal integer, storing it in *n. 11205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets *s to span the remainder of the string. 11215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets *out_re to the regexp for the class. 11225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool ParseInteger(StringPiece* s, int* np) { 11235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() == 0 || !isdigit((*s)[0] & 0xFF)) 11245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 11255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Disallow leading zeros. 11265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF)) 11275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 11285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int n = 0; 11295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int c; 11305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (s->size() > 0 && isdigit(c = (*s)[0] & 0xFF)) { 11315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Avoid overflow. 11325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (n >= 100000000) 11335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 11345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = n*10 + c - '0'; 11355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->remove_prefix(1); // digit 11365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *np = n; 11385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 11395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 11405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Parses a repetition suffix like {1,2} or {2} or {2,}. 11425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets *s to span the remainder of the string on success. 11435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets *lo and *hi to the given range. 11445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// In the case of {2,}, the high number is unbounded; 11455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// sets *hi to -1 to signify this. 11465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// {,2} is NOT a valid suffix. 11475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The Maybe in the name signifies that the regexp parse 11485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// doesn't fail even if ParseRepetition does, so the StringPiece 11495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// s must NOT be edited unless MaybeParseRepetition returns true. 11505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { 11515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece s = *sp; 11525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s.size() == 0 || s[0] != '{') 11535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 11545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s.remove_prefix(1); // '{' 11555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ParseInteger(&s, lo)) 11565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 11575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s.size() == 0) 11585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 11595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s[0] == ',') { 11605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s.remove_prefix(1); // ',' 11615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s.size() == 0) 11625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 11635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s[0] == '}') { 11645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // {2,} means at least 2 11655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *hi = -1; 11665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 11675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // {2,4} means 2, 3, or 4. 11685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ParseInteger(&s, hi)) 11695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 11705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 11725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // {2} means exactly two 11735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *hi = *lo; 11745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s.size() == 0 || s[0] != '}') 11765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 11775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s.remove_prefix(1); // '}' 11785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *sp = s; 11795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 11805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 11815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Removes the next Rune from the StringPiece and stores it in *r. 11835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns number of bytes removed from sp. 11845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Behaves as though there is a terminating NUL at the end of sp. 11855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Argument order is backwards from usual Google style 11865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// but consistent with chartorune. 11875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { 11885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int n; 11895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (fullrune(sp->data(), sp->size())) { 11905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = chartorune(r, sp->data()); 11915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!(n == 1 && *r == Runeerror)) { // no decoding error 11925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sp->remove_prefix(n); 11935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return n; 11945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpBadUTF8); 11985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(NULL); 11995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return -1; 12005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 12015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return whether name is valid UTF-8. 12035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If not, set status to kRegexpBadUTF8. 12045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) { 12055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece t = s; 12065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune r; 12075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (t.size() > 0) { 12085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (StringPieceToRune(&r, &t, status) < 0) 12095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 12105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 12115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 12125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 12135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Is c a hex digit? 12155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int IsHex(int c) { 12165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ('0' <= c && c <= '9') || 12175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ('A' <= c && c <= 'F') || 12185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ('a' <= c && c <= 'f'); 12195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 12205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Convert hex digit to value. 12225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int UnHex(int c) { 12235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ('0' <= c && c <= '9') 12245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return c - '0'; 12255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ('A' <= c && c <= 'F') 12265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return c - 'A' + 10; 12275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ('a' <= c && c <= 'f') 12285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return c - 'a' + 10; 12295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "Bad hex digit " << c; 12305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return 0; 12315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 12325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Parse an escape sequence (e.g., \n, \{). 12345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets *s to span the remainder of the string. 12355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets *rp to the named character. 12365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool ParseEscape(StringPiece* s, Rune* rp, 12375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status, int rune_max) { 12385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* begin = s->begin(); 12395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() < 1 || (*s)[0] != '\\') { 12405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Should not happen - caller always checks. 12415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpInternalError); 12425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(NULL); 12435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 12445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 12455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() < 2) { 12465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpTrailingBackslash); 12475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(NULL); 12485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 12495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 12505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune c, c1; 12515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->remove_prefix(1); // backslash 12525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (StringPieceToRune(&c, s, status) < 0) 12535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 12545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int code; 12555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (c) { 12565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: 12575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (c < Runeself && !isalpha(c) && !isdigit(c)) { 12585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Escaped non-word characters are always themselves. 12595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // PCRE is not quite so rigorous: it accepts things like 12605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // \q, but we don't. We once rejected \_, but too many 12615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // programs and people insist on using it, so allow \_. 12625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *rp = c; 12635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 12645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 12655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto BadEscape; 12665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Octal escapes. 12685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '1': 12695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '2': 12705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '3': 12715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '4': 12725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '5': 12735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '6': 12745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '7': 12755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Single non-zero octal digit is a backreference; not supported. 12765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() == 0 || (*s)[0] < '0' || (*s)[0] > '7') 12775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto BadEscape; 12785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // fall through 12795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '0': 12805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // consume up to three octal digits; already have one. 12815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) code = c - '0'; 12825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') { 12835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) code = code * 8 + c - '0'; 12845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->remove_prefix(1); // digit 12855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() > 0) { 12865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c = (*s)[0]; 12875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ('0' <= c && c <= '7') { 12885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) code = code * 8 + c - '0'; 12895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->remove_prefix(1); // digit 12905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 12915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 12925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 12935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *rp = code; 12945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 12955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Hexadecimal escapes 12975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 'x': 12985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() == 0) 12995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto BadEscape; 13005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (StringPieceToRune(&c, s, status) < 0) 13015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 13025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (c == '{') { 13035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Any number of digits in braces. 13045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Update n as we consume the string, so that 13055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the whole thing gets shown in the error message. 13065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Perl accepts any text at all; it ignores all text 13075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // after the first non-hex digit. We require only hex digits, 13085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and at least one. 13095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (StringPieceToRune(&c, s, status) < 0) 13105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 13115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nhex = 0; 13125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) code = 0; 13135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (IsHex(c)) { 13145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nhex++; 13155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) code = code * 16 + UnHex(c); 13165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (code > rune_max) 13175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto BadEscape; 13185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() == 0) 13195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto BadEscape; 13205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (StringPieceToRune(&c, s, status) < 0) 13215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 13225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 13235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (c != '}' || nhex == 0) 13245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto BadEscape; 13255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *rp = code; 13265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 13275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 13285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Easy case: two hex digits. 13295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() == 0) 13305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto BadEscape; 13315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (StringPieceToRune(&c1, s, status) < 0) 13325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 13335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!IsHex(c) || !IsHex(c1)) 13345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto BadEscape; 13355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *rp = UnHex(c) * 16 + UnHex(c1); 13365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 13375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // C escapes. 13395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 'n': 13405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *rp = '\n'; 13415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 13425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 'r': 13435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *rp = '\r'; 13445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 13455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 't': 13465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *rp = '\t'; 13475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 13485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Less common C escapes. 13505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 'a': 13515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *rp = '\a'; 13525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 13535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 'f': 13545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *rp = '\f'; 13555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 13565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 'v': 13575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *rp = '\v'; 13585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 13595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This code is disabled to avoid misparsing 13615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the Perl word-boundary \b as a backspace 13625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // when in POSIX regexp mode. Surprisingly, 13635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in Perl, \b means word-boundary but [\b] 13645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // means backspace. We don't support that: 13655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // if you want a backspace embed a literal 13665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // backspace character or use \x08. 13675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 13685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // case 'b': 13695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // *rp = '\b'; 13705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // return true; 13715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 13725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "Not reached in ParseEscape."; 13745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BadEscape: 13765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Unrecognized escape sequence. 13775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpBadEscape); 13785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(StringPiece(begin, s->data() - begin)); 13795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 13805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 13815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Add a range to the character class, but exclude newline if asked. 13835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Also handle case folding. 13845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void CharClassBuilder::AddRangeFlags( 13855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune lo, Rune hi, Regexp::ParseFlags parse_flags) { 13865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Take out \n if the flags say so. 13885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool cutnl = !(parse_flags & Regexp::ClassNL) || 13895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (parse_flags & Regexp::NeverNL); 13905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cutnl && lo <= '\n' && '\n' <= hi) { 13915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lo < '\n') 13925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddRangeFlags(lo, '\n' - 1, parse_flags); 13935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (hi > '\n') 13945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddRangeFlags('\n' + 1, hi, parse_flags); 13955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 13965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 13975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If folding case, add fold-equivalent characters too. 13995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (parse_flags & Regexp::FoldCase) 14005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddFoldedRange(this, lo, hi, 0); 14015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 14025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddRange(lo, hi); 14035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 14045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Look for a group with the given name. 14065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static UGroup* LookupGroup(const StringPiece& name, 14075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UGroup *groups, int ngroups) { 14085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Simple name lookup. 14095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < ngroups; i++) 14105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (StringPiece(groups[i].name) == name) 14115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return &groups[i]; 14125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 14135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 14145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Fake UGroup containing all Runes 14165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static URange16 any16[] = { { 0, 65535 } }; 14175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static URange32 any32[] = { { 65536, Runemax } }; 14185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 }; 14195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Look for a POSIX group with the given name (e.g., "[:^alpha:]") 14215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static UGroup* LookupPosixGroup(const StringPiece& name) { 14225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return LookupGroup(name, posix_groups, num_posix_groups); 14235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 14245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static UGroup* LookupPerlGroup(const StringPiece& name) { 14265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return LookupGroup(name, perl_groups, num_perl_groups); 14275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 14285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Look for a Unicode group with the given name (e.g., "Han") 14305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static UGroup* LookupUnicodeGroup(const StringPiece& name) { 14315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Special case: "Any" means any. 14325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (name == StringPiece("Any")) 14335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return &anygroup; 14345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return LookupGroup(name, unicode_groups, num_unicode_groups); 14355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 14365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Add a UGroup or its negation to the character class. 14385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void AddUGroup(CharClassBuilder *cc, UGroup *g, int sign, 14395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags parse_flags) { 14405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sign == +1) { 14415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < g->nr16; i++) { 14425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags); 14435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < g->nr32; i++) { 14455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags); 14465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 14485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (parse_flags & Regexp::FoldCase) { 14495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Normally adding a case-folded group means 14505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // adding all the extra fold-equivalent runes too. 14515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // But if we're adding the negation of the group, 14525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // we have to exclude all the runes that are fold-equivalent 14535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to what's already missing. Too hard, so do in two steps. 14545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CharClassBuilder ccb1; 14555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddUGroup(&ccb1, g, +1, parse_flags); 14565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If the flags say to take out \n, put it in, so that negating will take it out. 14575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags. 14585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool cutnl = !(parse_flags & Regexp::ClassNL) || 14595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (parse_flags & Regexp::NeverNL); 14605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cutnl) { 14615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ccb1.AddRange('\n', '\n'); 14625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ccb1.Negate(); 14645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cc->AddCharClass(&ccb1); 14655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 14665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int next = 0; 14685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < g->nr16; i++) { 14695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (next < g->r16[i].lo) 14705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags); 14715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) next = g->r16[i].hi + 1; 14725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < g->nr32; i++) { 14745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (next < g->r32[i].lo) 14755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags); 14765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) next = g->r32[i].hi + 1; 14775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (next <= Runemax) 14795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cc->AddRangeFlags(next, Runemax, parse_flags); 14805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 14825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Maybe parse a Perl character class escape sequence. 14845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Only recognizes the Perl character classes (\d \s \w \D \S \W), 14855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// not the Perl empty-string classes (\b \B \A \Z \z). 14865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// On success, sets *s to span the remainder of the string 14875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and returns the corresponding UGroup. 14885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The StringPiece must *NOT* be edited unless the call succeeds. 14895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { 14905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!(parse_flags & Regexp::PerlClasses)) 14915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 14925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() < 2 || (*s)[0] != '\\') 14935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 14945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Could use StringPieceToRune, but there aren't 14955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // any non-ASCII Perl group names. 14965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece name(s->begin(), 2); 14975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UGroup *g = LookupPerlGroup(name); 14985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (g == NULL) 14995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 15005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->remove_prefix(name.size()); 15015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return g; 15025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 15035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)enum ParseStatus { 15055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kParseOk, // Did some parsing. 15065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kParseError, // Found an error. 15075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kParseNothing, // Decided not to parse. 15085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 15095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Maybe parses a Unicode character group like \p{Han} or \P{Han} 15115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (the latter is a negated group). 15125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, 15135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CharClassBuilder *cc, 15145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status) { 15155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Decide whether to parse. 15165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!(parse_flags & Regexp::UnicodeGroups)) 15175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kParseNothing; 15185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() < 2 || (*s)[0] != '\\') 15195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kParseNothing; 15205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune c = (*s)[1]; 15215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (c != 'p' && c != 'P') 15225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kParseNothing; 15235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Committed to parse. Results: 15255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int sign = +1; // -1 = negated char class 15265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (c == 'P') 15275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sign = -1; 15285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece seq = *s; // \p{Han} or \pL 15295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece name; // Han or L 15305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->remove_prefix(2); // '\\', 'p' 15315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!StringPieceToRune(&c, s, status)) 15335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kParseError; 15345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (c != '{') { 15355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Name is the bit of string we just skipped over for c. 15365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* p = seq.begin() + 2; 15375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) name = StringPiece(p, s->begin() - p); 15385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 15395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Name is in braces. Look for closing } 15405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int end = s->find('}', 0); 15415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (end == s->npos) { 15425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!IsValidUTF8(seq, status)) 15435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kParseError; 15445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpBadCharRange); 15455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(seq); 15465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kParseError; 15475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 15485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) name = StringPiece(s->begin(), end); // without '}' 15495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->remove_prefix(end + 1); // with '}' 15505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!IsValidUTF8(name, status)) 15515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kParseError; 15525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 15535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Chop seq where s now begins. 15555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) seq = StringPiece(seq.begin(), s->begin() - seq.begin()); 15565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Look up group 15585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (name.size() > 0 && name[0] == '^') { 15595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sign = -sign; 15605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) name.remove_prefix(1); // '^' 15615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 15625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UGroup *g = LookupUnicodeGroup(name); 15635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (g == NULL) { 15645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpBadCharRange); 15655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(seq); 15665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kParseError; 15675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 15685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddUGroup(cc, g, sign, parse_flags); 15705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kParseOk; 15715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 15725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Parses a character class name like [:alnum:]. 15745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets *s to span the remainder of the string. 15755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Adds the ranges corresponding to the class to ranges. 15765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, 15775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CharClassBuilder *cc, 15785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status) { 15795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Check begins with [: 15805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* p = s->data(); 15815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* ep = s->data() + s->size(); 15825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ep - p < 2 || p[0] != '[' || p[1] != ':') 15835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kParseNothing; 15845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Look for closing :]. 15865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* q; 15875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++) 15885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ; 15895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If no closing :], then ignore. 15915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (q > ep-2) 15925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kParseNothing; 15935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Got it. Check that it's valid. 15955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q += 2; 15965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece name(p, q-p); 15975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UGroup *g = LookupPosixGroup(name); 15995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (g == NULL) { 16005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpBadCharRange); 16015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(name); 16025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kParseError; 16035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->remove_prefix(name.size()); 16065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddUGroup(cc, g, g->sign, parse_flags); 16075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kParseOk; 16085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 16095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Parses a character inside a character class. 16115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// There are fewer special characters here than in the rest of the regexp. 16125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets *s to span the remainder of the string. 16135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets *rp to the character. 16145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, 16155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const StringPiece& whole_class, 16165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status) { 16175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() == 0) { 16185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpMissingBracket); 16195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(whole_class); 16205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 16215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Allow regular escape sequences even though 16245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // many need not be escaped in this context. 16255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() >= 1 && (*s)[0] == '\\') 16265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ParseEscape(s, rp, status, rune_max_); 16275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Otherwise take the next rune. 16295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return StringPieceToRune(rp, s, status) >= 0; 16305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 16315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Parses a character class character, or, if the character 16335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is followed by a hyphen, parses a character class range. 16345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For single characters, rr->lo == rr->hi. 16355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets *s to span the remainder of the string. 16365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets *rp to the character. 16375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, 16385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const StringPiece& whole_class, 16395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status) { 16405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece os = *s; 16415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ParseCCCharacter(s, &rr->lo, whole_class, status)) 16425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 16435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // [a-] means (a|-), so check for final ]. 16445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') { 16455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->remove_prefix(1); // '-' 16465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ParseCCCharacter(s, &rr->hi, whole_class, status)) 16475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 16485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (rr->hi < rr->lo) { 16495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpBadCharRange); 16505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(StringPiece(os.data(), s->data() - os.data())); 16515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 16525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 16545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rr->hi = rr->lo; 16555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 16575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 16585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Parses a possibly-negated character class expression like [^abx-z[:digit:]]. 16605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets *s to span the remainder of the string. 16615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets *out_re to the regexp for the class. 16625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::ParseCharClass(StringPiece* s, 16635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** out_re, 16645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status) { 16655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece whole_class = *s; 16665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() == 0 || (*s)[0] != '[') { 16675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Caller checked this. 16685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpInternalError); 16695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(NULL); 16705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 16715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool negated = false; 16735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 16745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_ = new CharClassBuilder; 16755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->remove_prefix(1); // '[' 16765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() > 0 && (*s)[0] == '^') { 16775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->remove_prefix(1); // '^' 16785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) negated = true; 16795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!(flags_ & ClassNL) || (flags_ & NeverNL)) { 16805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If NL can't match implicitly, then pretend 16815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // negated classes include a leading \n. 16825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_->AddRange('\n', '\n'); 16835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool first = true; // ] is okay as first char in class 16865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (s->size() > 0 && ((*s)[0] != ']' || first)) { 16875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // - is only okay unescaped as first or last in class. 16885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Except that Perl allows - anywhere. 16895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((*s)[0] == '-' && !first && !(flags_&PerlX) && 16905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (s->size() == 1 || (*s)[1] != ']')) { 16915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece t = *s; 16925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(1); // '-' 16935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune r; 16945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int n = StringPieceToRune(&r, &t, status); 16955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (n < 0) { 16965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 16975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 16985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpBadCharRange); 17005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(StringPiece(s->data(), 1+n)); 17015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 17025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 17035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) first = false; 17055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Look for [:alnum:] etc. 17075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') { 17085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (ParseCCName(s, flags_, re->ccb_, status)) { 17095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kParseOk: 17105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 17115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kParseError: 17125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 17135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 17145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kParseNothing: 17155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 17165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Look for Unicode character group like \p{Han} 17205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() > 2 && 17215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (*s)[0] == '\\' && 17225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ((*s)[1] == 'p' || (*s)[1] == 'P')) { 17235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) { 17245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kParseOk: 17255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 17265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kParseError: 17275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 17285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 17295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kParseNothing: 17305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 17315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Look for Perl character class symbols (extension). 17355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UGroup *g = MaybeParsePerlCCEscape(s, flags_); 17365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (g != NULL) { 17375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddUGroup(re->ccb_, g, g->sign, flags_); 17385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 17395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Otherwise assume single character or simple range. 17425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RuneRange rr; 17435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ParseCCRange(s, &rr, whole_class, status)) { 17445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 17455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 17465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // AddRangeFlags is usually called in response to a class like 17485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // \p{Foo} or [[:foo:]]; for those, it filters \n out unless 17495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Regexp::ClassNL is set. In an explicit range or singleton 17505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // like we just parsed, we do not filter \n out, so set ClassNL 17515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in the flags. 17525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL); 17535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->size() == 0) { 17555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpMissingBracket); 17565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(whole_class); 17575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 17585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 17595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->remove_prefix(1); // ']' 17615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (negated) 17635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_->Negate(); 17645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_->RemoveAbove(rune_max_); 17655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *out_re = re; 17675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 17685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 17695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Is this a valid capture name? [A-Za-z0-9_]+ 17715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// PCRE limits names to 32 bytes. 17725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Python rejects names starting with digits. 17735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// We don't enforce either of those. 17745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool IsValidCaptureName(const StringPiece& name) { 17755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (name.size() == 0) 17765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 17775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < name.size(); i++) { 17785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int c = name[i]; 17795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (('0' <= c && c <= '9') || 17805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ('a' <= c && c <= 'z') || 17815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ('A' <= c && c <= 'Z') || 17825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c == '_') 17835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 17845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 17855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 17875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 17885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Parses a Perl flag setting or non-capturing group or both, 17905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// like (?i) or (?: or (?i:. Removes from s, updates parse state. 17915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The caller must check that s begins with "(?". 17925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns true on success. If the Perl flag is not 17935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// well-formed or not supported, sets status_ and returns false. 17945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { 17955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece t = *s; 17965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Caller is supposed to check this. 17985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') { 17995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags"; 18005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_code(kRegexpInternalError); 18015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 18025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 18035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(2); // "(?" 18055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Check for named captures, first introduced in Python's regexp library. 18075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // As usual, there are three slightly different syntaxes: 18085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 18095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (?P<name>expr) the original, introduced by Python 18105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (?<name>expr) the .NET alteration, adopted by Perl 5.10 18115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (?'name'expr) another .NET alteration, adopted by Perl 5.10 18125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 18135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Perl 5.10 gave in and implemented the Python version too, 18145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // but they claim that the last two are the preferred forms. 18155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // PCRE and languages based on it (specifically, PHP and Ruby) 18165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // support all three as well. EcmaScript 4 uses only the Python form. 18175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 18185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // In both the open source world (via Code Search) and the 18195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Google source tree, (?P<expr>name) is the dominant form, 18205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // so that's the one we implement. One is enough. 18215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (t.size() > 2 && t[0] == 'P' && t[1] == '<') { 18225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pull out name. 18235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int end = t.find('>', 2); 18245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (end == t.npos) { 18255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!IsValidUTF8(*s, status_)) 18265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 18275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_code(kRegexpBadNamedCapture); 18285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_error_arg(*s); 18295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 18305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 18315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // t is "P<name>...", t[end] == '>' 18335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece capture(t.begin()-2, end+3); // "(?P<name>" 18345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece name(t.begin()+2, end-2); // "name" 18355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!IsValidUTF8(name, status_)) 18365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 18375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!IsValidCaptureName(name)) { 18385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_code(kRegexpBadNamedCapture); 18395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_error_arg(capture); 18405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 18415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 18425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!DoLeftParen(name)) { 18445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // DoLeftParen's failure set status_. 18455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 18465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 18475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->remove_prefix(capture.end() - s->begin()); 18495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 18505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 18515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool negated = false; 18535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool sawflags = false; 18545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nflags = flags_; 18555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune c; 18565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (bool done = false; !done; ) { 18575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (t.size() == 0) 18585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto BadPerlOp; 18595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (StringPieceToRune(&c, &t, status_) < 0) 18605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 18615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (c) { 18625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: 18635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto BadPerlOp; 18645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Parse flags. 18665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 'i': 18675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sawflags = true; 18685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (negated) 18695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nflags &= ~FoldCase; 18705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 18715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nflags |= FoldCase; 18725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 18735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 'm': // opposite of our OneLine 18755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sawflags = true; 18765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (negated) 18775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nflags |= OneLine; 18785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 18795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nflags &= ~OneLine; 18805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 18815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 's': 18835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sawflags = true; 18845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (negated) 18855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nflags &= ~DotNL; 18865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 18875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nflags |= DotNL; 18885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 18895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 'U': 18915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sawflags = true; 18925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (negated) 18935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nflags &= ~NonGreedy; 18945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 18955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nflags |= NonGreedy; 18965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 18975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Negation 18995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '-': 19005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (negated) 19015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto BadPerlOp; 19025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) negated = true; 19035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sawflags = false; 19045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 19055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Open new group. 19075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case ':': 19085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!DoLeftParenNoCapture()) { 19095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // DoLeftParenNoCapture's failure set status_. 19105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 19115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) done = true; 19135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 19145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Finish flags. 19165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case ')': 19175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) done = true; 19185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 19195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (negated && !sawflags) 19235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto BadPerlOp; 19245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flags_ = static_cast<Regexp::ParseFlags>(nflags); 19265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *s = t; 19275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 19285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BadPerlOp: 19305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_code(kRegexpBadPerlOp); 19315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin())); 19325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 19335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 19345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Converts latin1 (assumed to be encoded as Latin1 bytes) 19365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// into UTF8 encoding in string. 19375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is 19385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// deprecated and because it rejects code points 0x80-0x9F. 19395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void ConvertLatin1ToUTF8(const StringPiece& latin1, string* utf) { 19405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char buf[UTFmax]; 19415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) utf->clear(); 19435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < latin1.size(); i++) { 19445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune r = latin1[i] & 0xFF; 19455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int n = runetochar(buf, &r); 19465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) utf->append(buf, n); 19475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 19495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Parses the regular expression given by s, 19515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// returning the corresponding Regexp tree. 19525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The caller must Decref the return value when done with it. 19535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns NULL on error. 19545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, 19555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status) { 19565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Make status non-NULL (easier on everyone else). 19575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus xstatus; 19585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (status == NULL) 19595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status = &xstatus; 19605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ParseState ps(global_flags, s, status); 19625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece t = s; 19635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Convert regexp to UTF-8 (easier on the rest of the parser). 19655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (global_flags & Latin1) { 19665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string* tmp = new string; 19675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ConvertLatin1ToUTF8(t, tmp); 19685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_tmp(tmp); 19695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t = *tmp; 19705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (global_flags & Literal) { 19735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Special parse loop for literal string. 19745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (t.size() > 0) { 19755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune r; 19765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (StringPieceToRune(&r, &t, status) < 0) 19775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 19785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushLiteral(r)) 19795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 19805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ps.DoFinish(); 19825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece lastunary = NULL; 19855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (t.size() > 0) { 19865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece isunary = NULL; 19875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (t[0]) { 19885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: { 19895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune r; 19905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (StringPieceToRune(&r, &t, status) < 0) 19915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 19925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushLiteral(r)) 19935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 19945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 19955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '(': 19985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // "(?" introduces Perl escape. 19995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) { 20005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Flag changes and non-capturing groups. 20015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.ParsePerlFlags(&t)) 20025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 20035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ps.flags() & NeverCapture) { 20065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.DoLeftParenNoCapture()) 20075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 20085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 20095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.DoLeftParen(NULL)) 20105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 20115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(1); // '(' 20135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '|': 20165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.DoVerticalBar()) 20175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 20185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(1); // '|' 20195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case ')': 20225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.DoRightParen()) 20235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 20245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(1); // ')' 20255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '^': // Beginning of line. 20285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushCarat()) 20295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 20305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(1); // '^' 20315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '$': // End of line. 20345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushDollar()) 20355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 20365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(1); // '$' 20375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '.': // Any character (possibly except newline). 20405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushDot()) 20415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 20425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(1); // '.' 20435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '[': { // Character class. 20465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re; 20475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.ParseCharClass(&t, &re, status)) 20485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 20495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushRegexp(re)) 20505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 20515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '*': { // Zero or more. 20555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpOp op; 20565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) op = kRegexpStar; 20575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto Rep; 20585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '+': // One or more. 20595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) op = kRegexpPlus; 20605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto Rep; 20615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '?': // Zero or one. 20625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) op = kRegexpQuest; 20635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto Rep; 20645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rep: 20655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece opstr = t; 20665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool nongreedy = false; 20675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(1); // '*' or '+' or '?' 20685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ps.flags() & PerlX) { 20695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (t.size() > 0 && t[0] == '?') { 20705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nongreedy = true; 20715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(1); // '?' 20725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lastunary.size() > 0) { 20745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // In Perl it is not allowed to stack repetition operators: 20755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // a** is a syntax error, not a double-star. 20765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (and a++ means something else entirely, which we don't support!) 20775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpRepeatOp); 20785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(StringPiece(lastunary.begin(), 20795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.begin() - lastunary.begin())); 20805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 20815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) opstr.set(opstr.data(), t.data() - opstr.data()); 20845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushRepeatOp(op, opstr, nongreedy)) 20855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 20865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) isunary = opstr; 20875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '{': { // Counted repetition. 20915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int lo, hi; 20925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece opstr = t; 20935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!MaybeParseRepetition(&t, &lo, &hi)) { 20945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Treat like a literal. 20955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushLiteral('{')) 20965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 20975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(1); // '{' 20985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool nongreedy = false; 21015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ps.flags() & PerlX) { 21025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (t.size() > 0 && t[0] == '?') { 21035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nongreedy = true; 21045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(1); // '?' 21055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lastunary.size() > 0) { 21075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Not allowed to stack repetition operators. 21085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpRepeatOp); 21095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(StringPiece(lastunary.begin(), 21105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.begin() - lastunary.begin())); 21115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 21125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) opstr.set(opstr.data(), t.data() - opstr.data()); 21155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushRepetition(lo, hi, opstr, nongreedy)) 21165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 21175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) isunary = opstr; 21185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 21195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 21215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case '\\': { // Escaped character or Perl sequence. 21225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // \b and \B: word boundary or not 21235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((ps.flags() & Regexp::PerlB) && 21245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) { 21255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushWordBoundary(t[1] == 'b')) 21265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 21275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(2); // '\\', 'b' 21285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 21295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 21315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) { 21325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (t[1] == 'A') { 21335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushSimpleOp(kRegexpBeginText)) 21345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 21355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(2); // '\\', 'A' 21365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 21375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (t[1] == 'z') { 21395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushSimpleOp(kRegexpEndText)) 21405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 21415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(2); // '\\', 'z' 21425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 21435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Do not recognize \Z, because this library can't 21455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // implement the exact Perl/PCRE semantics. 21465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (This library treats "(?-m)$" as \z, even though 21475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in Perl and PCRE it is equivalent to \Z.) 21485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 21495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (t[1] == 'C') { // \C: any byte [sic] 21505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushSimpleOp(kRegexpAnyByte)) 21515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 21525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(2); // '\\', 'C' 21535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 21545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 21565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (t[1] == 'Q') { // \Q ... \E: the ... is always literals 21575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(2); // '\\', 'Q' 21585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (t.size() > 0) { 21595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') { 21605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t.remove_prefix(2); // '\\', 'E' 21615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 21625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune r; 21645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (StringPieceToRune(&r, &t, status) < 0) 21655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 21665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushLiteral(r)) 21675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 21685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 21705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 21735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) { 21745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); 21755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_ = new CharClassBuilder; 21765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) { 21775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kParseOk: 21785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushRegexp(re)) 21795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 21805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto Break2; 21815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kParseError: 21825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 21835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 21845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kParseNothing: 21855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 21865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 21875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 21905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); 21915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (g != NULL) { 21925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); 21935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->ccb_ = new CharClassBuilder; 21945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddUGroup(re->ccb_, g, g->sign, ps.flags()); 21955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushRegexp(re)) 21965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 21975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 21985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 22005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune r; 22015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ParseEscape(&t, &r, status, ps.rune_max())) 22025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 22035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ps.PushLiteral(r)) 22045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 22055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 22065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 22075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 22085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Break2: 22095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lastunary = isunary; 22105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 22115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ps.DoFinish(); 22125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 22135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 22145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace re2 2215