12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2006 The RE2 Authors. All Rights Reserved. 22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style 32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file. 42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Regular expression parser. 62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The parser is a simple precedence-based parser with a 82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// manual stack. The parsing work is done by the methods 92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// of the ParseState class. The Regexp::Parse function is 102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// essentially just a lexer that calls the ParseState method 112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// for each token. 122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The parser recognizes POSIX extended regular expressions 142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// excluding backreferences, collating elements, and collating 152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// classes. It also allows the empty string as a regular expression 162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W. 172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// See regexp.h for rationale. 182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/util.h" 202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/regexp.h" 212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/stringpiece.h" 222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/unicode_casefold.h" 232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/unicode_groups.h" 242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 { 262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Regular expression parse state. 282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The list of parsed regexps so far is maintained as a vector of 292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Regexp pointers called the stack. Left parenthesis and vertical 302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// bar markers are also placed on the stack, as Regexps with 312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// non-standard opcodes. 322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Scanning a left parenthesis causes the parser to push a left parenthesis 332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// marker on the stack. 342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Scanning a vertical bar causes the parser to pop the stack until it finds a 352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// vertical bar or left parenthesis marker (not popping the marker), 362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// concatenate all the popped results, and push them back on 372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the stack (DoConcatenation). 382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Scanning a right parenthesis causes the parser to act as though it 392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// has seen a vertical bar, which then leaves the top of the stack in the 402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar. 412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The parser pops all this off the stack and creates an alternation of the 422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// regexps (DoAlternation). 432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass Regexp::ParseState { 452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public: 462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ParseState(ParseFlags flags, const StringPiece& whole_regexp, 472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus* status); 482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ~ParseState(); 492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ParseFlags flags() { return flags_; } 512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int rune_max() { return rune_max_; } 522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Parse methods. All public methods return a bool saying 542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // whether parsing should continue. If a method returns 552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // false, it has set fields in *status_, and the parser 562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // should return NULL. 572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pushes the given regular expression onto the stack. 592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Could check for too much memory used here. 602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool PushRegexp(Regexp* re); 612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pushes the literal rune r onto the stack. 632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool PushLiteral(Rune r); 642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pushes a regexp with the given op (and no args) onto the stack. 662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool PushSimpleOp(RegexpOp op); 672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pushes a ^ onto the stack. 692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool PushCarat(); 702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pushes a \b (word == true) or \B (word == false) onto the stack. 722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool PushWordBoundary(bool word); 732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pushes a $ onto the stack. 752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool PushDollar(); 762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pushes a . onto the stack 782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool PushDot(); 792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pushes a repeat operator regexp onto the stack. 812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // A valid argument for the operator must already be on the stack. 822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // s is the name of the operator, for use in error messages. 832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy); 842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pushes a repetition regexp onto the stack. 862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // A valid argument for the operator must already be on the stack. 872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy); 882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Checks whether a particular regexp op is a marker. 902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool IsMarker(RegexpOp op); 912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Processes a left parenthesis in the input. 932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pushes a marker onto the stack. 942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool DoLeftParen(const StringPiece& name); 952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool DoLeftParenNoCapture(); 962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Processes a vertical bar in the input. 982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool DoVerticalBar(); 992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Processes a right parenthesis in the input. 1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool DoRightParen(); 1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Processes the end of input, returning the final regexp. 1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* DoFinish(); 1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Finishes the regexp if necessary, preparing it for use 1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // in a more complicated expression. 1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If it is a CharClassBuilder, converts into a CharClass. 1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* FinishRegexp(Regexp*); 1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // These routines don't manipulate the parse stack 1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // directly, but they do need to look at flags_. 1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ParseCharClass also manipulates the internals of Regexp 1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // while creating *out_re. 1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Parse a character class into *out_re. 1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Removes parsed text from s. 1182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool ParseCharClass(StringPiece* s, Regexp** out_re, 1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus* status); 1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Parse a character class character into *rp. 1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Removes parsed text from s. 1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool ParseCCCharacter(StringPiece* s, Rune *rp, 1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const StringPiece& whole_class, 1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus* status); 1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Parse a character class range into rr. 1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Removes parsed text from s. 1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool ParseCCRange(StringPiece* s, RuneRange* rr, 1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const StringPiece& whole_class, 1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus* status); 1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Parse a Perl flag set or non-capturing group from s. 1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool ParsePerlFlags(StringPiece* s); 1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Finishes the current concatenation, 1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // collapsing it into a single regexp on the stack. 1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void DoConcatenation(); 1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Finishes the current alternation, 1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // collapsing it to a single regexp on the stack. 1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void DoAlternation(); 1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Generalized DoAlternation/DoConcatenation. 1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void DoCollapse(RegexpOp op); 1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Maybe concatenate Literals into LiteralString. 1492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool MaybeConcatString(int r, ParseFlags flags); 1502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonprivate: 1522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ParseFlags flags_; 1532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece whole_regexp_; 1542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus* status_; 1552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* stacktop_; 1562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int ncap_; // number of capturing parens seen 1572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int rune_max_; // maximum char value for this encoding 1582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DISALLOW_EVIL_CONSTRUCTORS(ParseState); 1602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 1612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Pseudo-operators - only on parse stack. 1632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonconst RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1); 1642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonconst RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2); 1652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonRegexp::ParseState::ParseState(ParseFlags flags, 1672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const StringPiece& whole_regexp, 1682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus* status) 1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson : flags_(flags), whole_regexp_(whole_regexp), 1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_(status), stacktop_(NULL), ncap_(0) { 1712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (flags_ & Latin1) 1722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rune_max_ = 0xFF; 1732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rune_max_ = Runemax; 1752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Cleans up by freeing all the regexps on the stack. 1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonRegexp::ParseState::~ParseState() { 1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* next; 1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (Regexp* re = stacktop_; re != NULL; re = next) { 1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson next = re->down_; 1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->down_ = NULL; 1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->op() == kLeftParen) 1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete re->name_; 1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Finishes the regexp if necessary, preparing it for use in 1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// a more complex expression. 1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If it is a CharClassBuilder, converts into a CharClass. 1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonRegexp* Regexp::ParseState::FinishRegexp(Regexp* re) { 1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re == NULL) 1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->down_ = NULL; 1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { 1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CharClassBuilder* ccb = re->ccb_; 1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_ = NULL; 2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->cc_ = ccb->GetCharClass(); 2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete ccb; 2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return re; 2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Pushes the given regular expression onto the stack. 2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Could check for too much memory used here. 2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::PushRegexp(Regexp* re) { 2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson MaybeConcatString(-1, NoParseFlags); 2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Special case: a character class of one character is just 2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // a literal. This is a common idiom for escaping 2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // single characters (e.g., [.] instead of \.), and some 2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // analysis does better with fewer character classes. 2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. 2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->op_ == kRegexpCharClass) { 2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->ccb_->size() == 1) { 2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune r = re->ccb_->begin()->lo; 2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re = new Regexp(kRegexpLiteral, flags_); 2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->rune_ = r; 2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (re->ccb_->size() == 2) { 2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune r = re->ccb_->begin()->lo; 2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) { 2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re = new Regexp(kRegexpLiteral, flags_ | FoldCase); 2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->rune_ = r + 'a' - 'A'; 2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!IsMarker(re->op())) 2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->simple_ = re->ComputeSimple(); 2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->down_ = stacktop_; 2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stacktop_ = re; 2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Searches the case folding tables and returns the CaseFold* that contains r. 2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r. 2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If there isn't one, returns NULL. 2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonCaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) { 2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CaseFold* ef = f + n; 2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Binary search for entry containing r. 2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (n > 0) { 2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int m = n/2; 2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (f[m].lo <= r && r <= f[m].hi) 2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return &f[m]; 2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (r < f[m].lo) { 2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson n = m; 2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson f += m+1; 2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson n -= m+1; 2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // There is no entry that contains r, but f points 2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // where it would have been. Unless f points at 2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the end of the array, it points at the next entry 2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // after r. 2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (f < ef) 2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return f; 2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // No entry contains r; no entry contains runes > r. 2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns the result of applying the fold f to the rune r. 2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonRune ApplyFold(CaseFold *f, Rune r) { 2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (f->delta) { 2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return r + f->delta; 2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case EvenOddSkip: // even <-> odd but only applies to every other 2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((r - f->lo) % 2) 2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return r; 2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // fall through 2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case EvenOdd: // even <-> odd 2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (r%2 == 0) 2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return r + 1; 2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return r - 1; 2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case OddEvenSkip: // odd <-> even but only applies to every other 2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((r - f->lo) % 2) 2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return r; 2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // fall through 2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case OddEven: // odd <-> even 2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (r%2 == 1) 2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return r + 1; 2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return r - 1; 2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns the next Rune in r's folding cycle (see unicode_casefold.h). 2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Examples: 2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// CycleFoldRune('A') = 'a' 2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// CycleFoldRune('a') = 'A' 3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// CycleFoldRune('K') = 'k' 3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// CycleFoldRune('k') = 0x212A (Kelvin) 3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// CycleFoldRune(0x212A) = 'K' 3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// CycleFoldRune('?') = '?' 3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonRune CycleFoldRune(Rune r) { 3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r); 3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (f == NULL || r < f->lo) 3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return r; 3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ApplyFold(f, r); 3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Add lo-hi to the class, along with their fold-equivalent characters. 3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If lo-hi is already in the class, assume that the fold-equivalent 3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// chars are there too, so there's no work to do. 3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { 3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // AddFoldedRange calls itself recursively for each rune in the fold cycle. 3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Most folding cycles are small: there aren't any bigger than four in the 3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // current Unicode tables. make_unicode_casefold.py checks that 3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the cycles are not too long, and we double-check here using depth. 3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (depth > 10) { 3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "AddFoldedRange recurses too much."; 3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done 3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (lo <= hi) { 3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo); 3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (f == NULL) // lo has no fold, nor does anything above lo 3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo 3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lo = f->lo; 3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Add in the result of folding the range lo - f->hi 3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and that range's fold, recursively. 3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune lo1 = lo; 3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune hi1 = min<Rune>(hi, f->hi); 3422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (f->delta) { 3432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 3442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lo1 += f->delta; 3452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson hi1 += f->delta; 3462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 3472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case EvenOdd: 3482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (lo1%2 == 1) 3492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lo1--; 3502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (hi1%2 == 0) 3512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson hi1++; 3522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 3532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case OddEven: 3542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (lo1%2 == 0) 3552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lo1--; 3562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (hi1%2 == 1) 3572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson hi1++; 3582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 3592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddFoldedRange(cc, lo1, hi1, depth+1); 3612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pick up where this fold left off. 3632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lo = f->hi + 1; 3642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Pushes the literal rune r onto the stack. 3682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::PushLiteral(Rune r) { 3692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Do case folding if needed. 3702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((flags_ & FoldCase) && CycleFoldRune(r) != r) { 3712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 3722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_ = new CharClassBuilder; 3732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune r1 = r; 3742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson do { 3752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!(flags_ & NeverNL) || r != '\n') { 3762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_->AddRange(r, r); 3772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r = CycleFoldRune(r); 3792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } while (r != r1); 3802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_->RemoveAbove(rune_max_); 3812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushRegexp(re); 3822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Exclude newline if applicable. 3852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((flags_ & NeverNL) && r == '\n') 3862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushRegexp(new Regexp(kRegexpNoMatch, flags_)); 3872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // No fancy stuff worked. Ordinary literal. 3892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (MaybeConcatString(r, flags_)) 3902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 3912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = new Regexp(kRegexpLiteral, flags_); 3932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->rune_ = r; 3942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushRegexp(re); 3952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Pushes a ^ onto the stack. 3982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::PushCarat() { 3992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (flags_ & OneLine) { 4002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushSimpleOp(kRegexpBeginText); 4012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushSimpleOp(kRegexpBeginLine); 4032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Pushes a \b or \B onto the stack. 4062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::PushWordBoundary(bool word) { 4072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (word) 4082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushSimpleOp(kRegexpWordBoundary); 4092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushSimpleOp(kRegexpNoWordBoundary); 4102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Pushes a $ onto the stack. 4132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::PushDollar() { 4142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (flags_ & OneLine) { 4152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Clumsy marker so that MimicsPCRE() can tell whether 4162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // this kRegexpEndText was a $ and not a \z. 4172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp::ParseFlags oflags = flags_; 4182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags_ = flags_ | WasDollar; 4192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool ret = PushSimpleOp(kRegexpEndText); 4202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags_ = oflags; 4212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ret; 4222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushSimpleOp(kRegexpEndLine); 4242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Pushes a . onto the stack. 4272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::PushDot() { 4282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((flags_ & DotNL) && !(flags_ & NeverNL)) 4292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushSimpleOp(kRegexpAnyChar); 4302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Rewrite . into [^\n] 4312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 4322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_ = new CharClassBuilder; 4332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_->AddRange(0, '\n' - 1); 4342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_->AddRange('\n' + 1, rune_max_); 4352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushRegexp(re); 4362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Pushes a regexp with the given op (and no args) onto the stack. 4392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::PushSimpleOp(RegexpOp op) { 4402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = new Regexp(op, flags_); 4412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushRegexp(re); 4422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Pushes a repeat operator regexp onto the stack. 4452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// A valid argument for the operator must already be on the stack. 4462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The char c is the name of the operator, for use in error messages. 4472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, 4482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool nongreedy) { 4492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (stacktop_ == NULL || IsMarker(stacktop_->op())) { 4502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_code(kRegexpRepeatArgument); 4512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_error_arg(s); 4522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 4532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp::ParseFlags fl = flags_; 4552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (nongreedy) 4562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fl = fl ^ NonGreedy; 4572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = new Regexp(op, fl); 4582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->AllocSub(1); 4592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->down_ = stacktop_->down_; 4602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->sub()[0] = FinishRegexp(stacktop_); 4612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->simple_ = re->ComputeSimple(); 4622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stacktop_ = re; 4632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 4642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Pushes a repetition regexp onto the stack. 4672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// A valid argument for the operator must already be on the stack. 4682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::PushRepetition(int min, int max, 4692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const StringPiece& s, 4702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool nongreedy) { 4712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((max != -1 && max < min) || min > 1000 || max > 1000) { 4722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_code(kRegexpRepeatSize); 4732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_error_arg(s); 4742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 4752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (stacktop_ == NULL || IsMarker(stacktop_->op())) { 4772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_code(kRegexpRepeatArgument); 4782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_error_arg(s); 4792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 4802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp::ParseFlags fl = flags_; 4822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (nongreedy) 4832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fl = fl ^ NonGreedy; 4842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = new Regexp(kRegexpRepeat, fl); 4852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->min_ = min; 4862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->max_ = max; 4872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->AllocSub(1); 4882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->down_ = stacktop_->down_; 4892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->sub()[0] = FinishRegexp(stacktop_); 4902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->simple_ = re->ComputeSimple(); 4912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stacktop_ = re; 4932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 4942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Checks whether a particular regexp op is a marker. 4972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::IsMarker(RegexpOp op) { 4982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return op >= kLeftParen; 4992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 5002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Processes a left parenthesis in the input. 5022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Pushes a marker onto the stack. 5032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::DoLeftParen(const StringPiece& name) { 5042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = new Regexp(kLeftParen, flags_); 5052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->cap_ = ++ncap_; 5062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (name.data() != NULL) 5072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->name_ = new string(name.as_string()); 5082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushRegexp(re); 5092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 5102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Pushes a non-capturing marker onto the stack. 5122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::DoLeftParenNoCapture() { 5132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = new Regexp(kLeftParen, flags_); 5142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->cap_ = -1; 5152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushRegexp(re); 5162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 5172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Adds r to cc, along with r's upper case if foldascii is set. 5192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) { 5202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cc->AddRange(r, r); 5212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (foldascii && 'a' <= r && r <= 'z') 5222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cc->AddRange(r + 'A' - 'a', r + 'A' - 'a'); 5232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 5242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Processes a vertical bar in the input. 5262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::DoVerticalBar() { 5272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson MaybeConcatString(-1, NoParseFlags); 5282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DoConcatenation(); 5292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Below the vertical bar is a list to alternate. 5312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Above the vertical bar is a list to concatenate. 5322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // We just did the concatenation, so either swap 5332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the result below the vertical bar or push a new 5342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // vertical bar on the stack. 5352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* r1; 5362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* r2; 5372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((r1 = stacktop_) != NULL && 5382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (r2 = stacktop_->down_) != NULL && 5392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r2->op() == kVerticalBar) { 5402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If above and below vertical bar are literal or char class, 5412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // can merge into a single char class. 5422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* r3; 5432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((r1->op() == kRegexpLiteral || 5442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r1->op() == kRegexpCharClass || 5452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r1->op() == kRegexpAnyChar) && 5462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (r3 = r2->down_) != NULL) { 5472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune rune; 5482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (r3->op()) { 5492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpLiteral: // convert to char class 5502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rune = r3->rune_; 5512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r3->op_ = kRegexpCharClass; 5522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r3->cc_ = NULL; 5532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r3->ccb_ = new CharClassBuilder; 5542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase); 5552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // fall through 5562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpCharClass: 5572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (r1->op() == kRegexpLiteral) 5582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddLiteral(r3->ccb_, r1->rune_, 5592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r1->parse_flags_ & Regexp::FoldCase); 5602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else if (r1->op() == kRegexpCharClass) 5612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r3->ccb_->AddCharClass(r1->ccb_); 5622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) { 5632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete r3->ccb_; 5642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r3->ccb_ = NULL; 5652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r3->op_ = kRegexpAnyChar; 5662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // fall through 5682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpAnyChar: 5692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // pop r1 5702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stacktop_ = r2; 5712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r1->Decref(); 5722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 5732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 5742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 5752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Swap r1 below vertical bar (r2). 5792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r1->down_ = r2->down_; 5802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r2->down_ = r1; 5812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stacktop_ = r2; 5822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 5832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushSimpleOp(kVerticalBar); 5852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 5862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Processes a right parenthesis in the input. 5882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::DoRightParen() { 5892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Finish the current concatenation and alternation. 5902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DoAlternation(); 5912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The stack should be: LeftParen regexp 5932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Remove the LeftParen, leaving the regexp, 5942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // parenthesized. 5952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* r1; 5962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* r2; 5972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((r1 = stacktop_) == NULL || 5982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (r2 = r1->down_) == NULL || 5992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r2->op() != kLeftParen) { 6002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_code(kRegexpMissingParen); 6012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_error_arg(whole_regexp_); 6022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 6032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pop off r1, r2. Will Decref or reuse below. 6062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stacktop_ = r2->down_; 6072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Restore flags from when paren opened. 6092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = r2; 6102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags_ = re->parse_flags(); 6112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Rewrite LeftParen as capture if needed. 6132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->cap_ > 0) { 6142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->op_ = kRegexpCapture; 6152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // re->cap_ is already set 6162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->AllocSub(1); 6172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->sub()[0] = FinishRegexp(r1); 6182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->simple_ = re->ComputeSimple(); 6192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 6202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 6212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re = r1; 6222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return PushRegexp(re); 6242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 6252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Processes the end of input, returning the final regexp. 6272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonRegexp* Regexp::ParseState::DoFinish() { 6282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DoAlternation(); 6292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = stacktop_; 6302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re != NULL && re->down_ != NULL) { 6312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_code(kRegexpMissingParen); 6322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_error_arg(whole_regexp_); 6332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 6342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stacktop_ = NULL; 6362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return FinishRegexp(re); 6372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 6382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns the leading regexp that re starts with. 6402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The returned Regexp* points into a piece of re, 6412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// so it must not be used after the caller calls re->Decref(). 6422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonRegexp* Regexp::LeadingRegexp(Regexp* re) { 6432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->op() == kRegexpEmptyMatch) 6442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 6452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->op() == kRegexpConcat && re->nsub() >= 2) { 6462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp** sub = re->sub(); 6472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (sub[0]->op() == kRegexpEmptyMatch) 6482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 6492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return sub[0]; 6502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return re; 6522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 6532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Removes LeadingRegexp(re) from re and returns what's left. 6552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Consumes the reference to re and may edit it in place. 6562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If caller wants to hold on to LeadingRegexp(re), 6572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// must have already Incref'ed it. 6582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonRegexp* Regexp::RemoveLeadingRegexp(Regexp* re) { 6592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->op() == kRegexpEmptyMatch) 6602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return re; 6612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->op() == kRegexpConcat && re->nsub() >= 2) { 6622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp** sub = re->sub(); 6632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (sub[0]->op() == kRegexpEmptyMatch) 6642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return re; 6652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[0]->Decref(); 6662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[0] = NULL; 6672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->nsub() == 2) { 6682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Collapse concatenation to single regexp. 6692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* nre = sub[1]; 6702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[1] = NULL; 6712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 6722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return nre; 6732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 3 or more -> 2 or more. 6752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->nsub_--; 6762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); 6772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return re; 6782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp::ParseFlags pf = re->parse_flags(); 6802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 6812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return new Regexp(kRegexpEmptyMatch, pf); 6822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 6832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns the leading string that re starts with. 6852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The returned Rune* points into a piece of re, 6862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// so it must not be used after the caller calls re->Decref(). 6872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonRune* Regexp::LeadingString(Regexp* re, int *nrune, 6882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp::ParseFlags *flags) { 6892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (re->op() == kRegexpConcat && re->nsub() > 0) 6902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re = re->sub()[0]; 6912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase); 6932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->op() == kRegexpLiteral) { 6952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *nrune = 1; 6962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return &re->rune_; 6972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->op() == kRegexpLiteralString) { 7002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *nrune = re->nrunes_; 7012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return re->runes_; 7022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *nrune = 0; 7052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 7062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 7072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Removes the first n leading runes from the beginning of re. 7092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Edits re in place. 7102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Regexp::RemoveLeadingString(Regexp* re, int n) { 7112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Chase down concats to find first string. 7122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // For regexps generated by parser, nested concats are 7132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // flattened except when doing so would overflow the 16-bit 7142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // limit on the size of a concatenation, so we should never 7152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // see more than two here. 7162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* stk[4]; 7172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int d = 0; 7182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (re->op() == kRegexpConcat) { 7192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (d < arraysize(stk)) 7202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[d++] = re; 7212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re = re->sub()[0]; 7222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Remove leading string from re. 7252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->op() == kRegexpLiteral) { 7262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->rune_ = 0; 7272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->op_ = kRegexpEmptyMatch; 7282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (re->op() == kRegexpLiteralString) { 7292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (n >= re->nrunes_) { 7302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] re->runes_; 7312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->runes_ = NULL; 7322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->nrunes_ = 0; 7332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->op_ = kRegexpEmptyMatch; 7342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (n == re->nrunes_ - 1) { 7352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune rune = re->runes_[re->nrunes_ - 1]; 7362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] re->runes_; 7372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->runes_ = NULL; 7382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->nrunes_ = 0; 7392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->rune_ = rune; 7402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->op_ = kRegexpLiteral; 7412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 7422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->nrunes_ -= n; 7432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]); 7442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If re is now empty, concatenations might simplify too. 7482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (d-- > 0) { 7492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re = stk[d]; 7502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp** sub = re->sub(); 7512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (sub[0]->op() == kRegexpEmptyMatch) { 7522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[0]->Decref(); 7532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[0] = NULL; 7542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Delete first element of concat. 7552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (re->nsub()) { 7562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 0: 7572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 1: 7582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Impossible. 7592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Concat of " << re->nsub(); 7602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->submany_ = NULL; 7612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->op_ = kRegexpEmptyMatch; 7622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 7632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 2: { 7652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Replace re with sub[1]. 7662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* old = sub[1]; 7672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[1] = NULL; 7682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Swap(old); 7692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson old->Decref(); 7702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 7712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 7742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Slide down. 7752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->nsub_--; 7762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); 7772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 7782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 7822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Factors common prefixes from alternation. 7842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// For example, 7852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// ABC|ABD|AEF|BCX|BCY 7862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// simplifies to 7872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// A(B(C|D)|EF)|BC(X|Y) 7882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// which the normal parse state routines will further simplify to 7892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// A(B[CD]|EF)|BC[XY] 7902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 7912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Rewrites sub to contain simplified list to alternate and returns 7922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the new length of sub. Adjusts reference counts accordingly 7932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// (incoming sub[i] decremented, outgoing sub[i] incremented). 7942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// It's too much of a pain to write this code with an explicit stack, 7962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// so instead we let the caller specify a maximum depth and 7972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// don't simplify beyond that. There are around 15 words of local 7982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// variables and parameters in the frame, so allowing 8 levels 7992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// on a 64-bit machine is still less than a kilobyte of stack and 8002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// probably enough benefit for practical uses. 8012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonconst int kFactorAlternationMaxDepth = 8; 8022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonint Regexp::FactorAlternation( 8042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp** sub, int n, 8052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp::ParseFlags altflags) { 8062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return FactorAlternationRecursive(sub, n, altflags, 8072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kFactorAlternationMaxDepth); 8082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 8092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonint Regexp::FactorAlternationRecursive( 8112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp** sub, int n, 8122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp::ParseFlags altflags, 8132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int maxdepth) { 8142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (maxdepth <= 0) 8162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return n; 8172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Round 1: Factor out common literal prefixes. 8192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune *rune = NULL; 8202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nrune = 0; 8212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp::ParseFlags runeflags = Regexp::NoParseFlags; 8222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int start = 0; 8232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int out = 0; 8242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i <= n; i++) { 8252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Invariant: what was in sub[0:start] has been Decref'ed 8262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and that space has been reused for sub[0:out] (out <= start). 8272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 8282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Invariant: sub[start:i] consists of regexps that all begin 8292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // with the string rune[0:nrune]. 8302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune* rune_i = NULL; 8322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nrune_i = 0; 8332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags; 8342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i < n) { 8352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rune_i = LeadingString(sub[i], &nrune_i, &runeflags_i); 8362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (runeflags_i == runeflags) { 8372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int same = 0; 8382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (same < nrune && same < nrune_i && rune[same] == rune_i[same]) 8392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson same++; 8402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (same > 0) { 8412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Matches at least one rune in current range. Keep going around. 8422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nrune = same; 8432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 8442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Found end of a run with common leading literal string: 8492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // sub[start:i] all begin with rune[0:nrune] but sub[i] 8502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // does not even begin with rune[0]. 8512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 8522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Factor out common string and append factored expression to sub[0:out]. 8532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i == start) { 8542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Nothing to do - first iteration. 8552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (i == start+1) { 8562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Just one: don't bother factoring. 8572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[out++] = sub[start]; 8582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 8592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Construct factored form: prefix(suffix1|suffix2|...) 8602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... 8612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson x[0] = LiteralString(rune, nrune, runeflags); 8622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int j = start; j < i; j++) 8632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RemoveLeadingString(sub[j], nrune); 8642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nn = FactorAlternationRecursive(sub + start, i - start, altflags, 8652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson maxdepth - 1); 8662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson x[1] = AlternateNoFactor(sub + start, nn, altflags); 8672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[out++] = Concat(x, 2, altflags); 8682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Prepare for next round (if there is one). 8712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i < n) { 8722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = i; 8732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rune = rune_i; 8742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nrune = nrune_i; 8752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson runeflags = runeflags_i; 8762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson n = out; 8792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Round 2: Factor out common complex prefixes, 8812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // just the first piece of each concatenation, 8822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // whatever it is. This is good enough a lot of the time. 8832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = 0; 8842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson out = 0; 8852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* first = NULL; 8862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i <= n; i++) { 8872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Invariant: what was in sub[0:start] has been Decref'ed 8882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and that space has been reused for sub[0:out] (out <= start). 8892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 8902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Invariant: sub[start:i] consists of regexps that all begin with first. 8912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* first_i = NULL; 8932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i < n) { 8942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson first_i = LeadingRegexp(sub[i]); 8952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (first != NULL && Regexp::Equal(first, first_i)) { 8962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 8972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Found end of a run with common leading regexp: 9012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // sub[start:i] all begin with first but sub[i] does not. 9022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 9032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Factor out common regexp and append factored expression to sub[0:out]. 9042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i == start) { 9052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Nothing to do - first iteration. 9062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (i == start+1) { 9072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Just one: don't bother factoring. 9082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[out++] = sub[start]; 9092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 9102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Construct factored form: prefix(suffix1|suffix2|...) 9112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... 9122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson x[0] = first->Incref(); 9132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int j = start; j < i; j++) 9142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[j] = RemoveLeadingRegexp(sub[j]); 9152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nn = FactorAlternationRecursive(sub + start, i - start, altflags, 9162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson maxdepth - 1); 9172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson x[1] = AlternateNoFactor(sub + start, nn, altflags); 9182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[out++] = Concat(x, 2, altflags); 9192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Prepare for next round (if there is one). 9222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i < n) { 9232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = i; 9242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson first = first_i; 9252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson n = out; 9282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Round 3: Collapse runs of single literals into character classes. 9302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = 0; 9312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson out = 0; 9322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i <= n; i++) { 9332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Invariant: what was in sub[0:start] has been Decref'ed 9342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and that space has been reused for sub[0:out] (out <= start). 9352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 9362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Invariant: sub[start:i] consists of regexps that are either 9372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // literal runes or character classes. 9382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i < n && 9402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (sub[i]->op() == kRegexpLiteral || 9412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[i]->op() == kRegexpCharClass)) 9422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 9432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // sub[i] is not a char or char class; 9452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // emit char class for sub[start:i]... 9462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i == start) { 9472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Nothing to do. 9482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (i == start+1) { 9492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[out++] = sub[start]; 9502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 9512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Make new char class. 9522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CharClassBuilder ccb; 9532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int j = start; j < i; j++) { 9542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = sub[j]; 9552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->op() == kRegexpCharClass) { 9562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CharClass* cc = re->cc(); 9572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it) 9582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ccb.AddRange(it->lo, it->hi); 9592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (re->op() == kRegexpLiteral) { 9602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags()); 9612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 9622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " " 9632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson << re->ToString(); 9642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 9662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[out++] = NewCharClass(ccb.GetCharClass(), altflags); 9682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ... and then emit sub[i]. 9712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i < n) 9722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[out++] = sub[i]; 9732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = i+1; 9742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson n = out; 9762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Round 4: Collapse runs of empty matches into single empty match. 9782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = 0; 9792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson out = 0; 9802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < n; i++) { 9812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i + 1 < n && 9822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[i]->op() == kRegexpEmptyMatch && 9832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[i+1]->op() == kRegexpEmptyMatch) { 9842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[i]->Decref(); 9852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 9862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub[out++] = sub[i]; 9882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson n = out; 9902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return n; 9922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 9932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Collapse the regexps on top of the stack, down to the 9952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// first marker, into a new op node (op == kRegexpAlternate 9962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// or op == kRegexpConcat). 9972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Regexp::ParseState::DoCollapse(RegexpOp op) { 9982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Scan backward to marker, counting children of composite. 9992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int n = 0; 10002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* next = NULL; 10012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* sub; 10022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { 10032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson next = sub->down_; 10042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (sub->op_ == op) 10052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson n += sub->nsub_; 10062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 10072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson n++; 10082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 10092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If there's just one child, leave it alone. 10112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (Concat of one thing is that one thing; alternate of one thing is same.) 10122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (stacktop_ != NULL && stacktop_->down_ == next) 10132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 10142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Construct op (alternation or concatenation), flattening op of op. 10162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp** subs = new Regexp*[n]; 10172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson next = NULL; 10182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int i = n; 10192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { 10202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson next = sub->down_; 10212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (sub->op_ == op) { 10222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp** sub_subs = sub->sub(); 10232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int k = sub->nsub_ - 1; k >= 0; k--) 10242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson subs[--i] = sub_subs[k]->Incref(); 10252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub->Decref(); 10262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 10272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson subs[--i] = FinishRegexp(sub); 10282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 10292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 10302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = ConcatOrAlternate(op, subs, n, flags_, true); 10322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] subs; 10332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->simple_ = re->ComputeSimple(); 10342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->down_ = next; 10352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stacktop_ = re; 10362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 10372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Finishes the current concatenation, 10392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// collapsing it into a single regexp on the stack. 10402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Regexp::ParseState::DoConcatenation() { 10412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* r1 = stacktop_; 10422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (r1 == NULL || IsMarker(r1->op())) { 10432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // empty concatenation is special case 10442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = new Regexp(kRegexpEmptyMatch, flags_); 10452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PushRegexp(re); 10462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 10472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DoCollapse(kRegexpConcat); 10482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 10492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Finishes the current alternation, 10512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// collapsing it to a single regexp on the stack. 10522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Regexp::ParseState::DoAlternation() { 10532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DoVerticalBar(); 10542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Now stack top is kVerticalBar. 10552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* r1 = stacktop_; 10562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stacktop_ = r1->down_; 10572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson r1->Decref(); 10582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DoCollapse(kRegexpAlternate); 10592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 10602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Incremental conversion of concatenated literals into strings. 10622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If top two elements on stack are both literal or string, 10632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// collapse into single string. 10642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Don't walk down the stack -- the parser calls this frequently 10652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// enough that below the bottom two is known to be collapsed. 10662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Only called when another regexp is about to be pushed 10672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// on the stack, so that the topmost literal is not being considered. 10682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// (Otherwise ab* would turn into (ab)*.) 10692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If r >= 0, consider pushing a literal r on the stack. 10702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Return whether that happened. 10712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { 10722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re1; 10732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re2; 10742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL) 10752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 10762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString) 10782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 10792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString) 10802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 10812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase)) 10822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 10832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re2->op_ == kRegexpLiteral) { 10852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // convert into string 10862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune rune = re2->rune_; 10872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re2->op_ = kRegexpLiteralString; 10882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re2->nrunes_ = 0; 10892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re2->runes_ = NULL; 10902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re2->AddRuneToString(rune); 10912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 10922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // push re1 into re2. 10942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re1->op_ == kRegexpLiteral) { 10952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re2->AddRuneToString(re1->rune_); 10962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 10972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < re1->nrunes_; i++) 10982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re2->AddRuneToString(re1->runes_[i]); 10992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re1->nrunes_ = 0; 11002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] re1->runes_; 11012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re1->runes_ = NULL; 11022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 11032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // reuse re1 if possible 11052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (r >= 0) { 11062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re1->op_ = kRegexpLiteral; 11072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re1->rune_ = r; 11082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re1->parse_flags_ = flags; 11092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 11102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 11112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stacktop_ = re2; 11132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re1->Decref(); 11142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 11152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 11162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Lexing routines. 11182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Parses a decimal integer, storing it in *n. 11202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Sets *s to span the remainder of the string. 11212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Sets *out_re to the regexp for the class. 11222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool ParseInteger(StringPiece* s, int* np) { 11232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() == 0 || !isdigit((*s)[0] & 0xFF)) 11242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 11252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Disallow leading zeros. 11262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF)) 11272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 11282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int n = 0; 11292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int c; 11302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (s->size() > 0 && isdigit(c = (*s)[0] & 0xFF)) { 11312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Avoid overflow. 11322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (n >= 100000000) 11332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 11342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson n = n*10 + c - '0'; 11352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->remove_prefix(1); // digit 11362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 11372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *np = n; 11382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 11392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 11402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Parses a repetition suffix like {1,2} or {2} or {2,}. 11422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Sets *s to span the remainder of the string on success. 11432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Sets *lo and *hi to the given range. 11442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// In the case of {2,}, the high number is unbounded; 11452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// sets *hi to -1 to signify this. 11462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// {,2} is NOT a valid suffix. 11472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The Maybe in the name signifies that the regexp parse 11482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// doesn't fail even if ParseRepetition does, so the StringPiece 11492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// s must NOT be edited unless MaybeParseRepetition returns true. 11502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { 11512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece s = *sp; 11522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s.size() == 0 || s[0] != '{') 11532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 11542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s.remove_prefix(1); // '{' 11552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ParseInteger(&s, lo)) 11562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 11572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s.size() == 0) 11582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 11592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s[0] == ',') { 11602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s.remove_prefix(1); // ',' 11612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s.size() == 0) 11622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 11632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s[0] == '}') { 11642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // {2,} means at least 2 11652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *hi = -1; 11662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 11672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // {2,4} means 2, 3, or 4. 11682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ParseInteger(&s, hi)) 11692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 11702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 11712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 11722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // {2} means exactly two 11732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *hi = *lo; 11742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 11752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s.size() == 0 || s[0] != '}') 11762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 11772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s.remove_prefix(1); // '}' 11782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *sp = s; 11792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 11802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 11812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Removes the next Rune from the StringPiece and stores it in *r. 11832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns number of bytes removed from sp. 11842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Behaves as though there is a terminating NUL at the end of sp. 11852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Argument order is backwards from usual Google style 11862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// but consistent with chartorune. 11872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { 11882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int n; 11892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (fullrune(sp->data(), sp->size())) { 11902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson n = chartorune(r, sp->data()); 11912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!(n == 1 && *r == Runeerror)) { // no decoding error 11922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sp->remove_prefix(n); 11932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return n; 11942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 11952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 11962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpBadUTF8); 11982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(NULL); 11992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return -1; 12002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 12012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 12022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Return whether name is valid UTF-8. 12032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If not, set status to kRegexpBadUTF8. 12042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) { 12052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece t = s; 12062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune r; 12072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (t.size() > 0) { 12082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (StringPieceToRune(&r, &t, status) < 0) 12092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 12102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 12112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 12122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 12132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 12142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Is c a hex digit? 12152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic int IsHex(int c) { 12162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ('0' <= c && c <= '9') || 12172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ('A' <= c && c <= 'F') || 12182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ('a' <= c && c <= 'f'); 12192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 12202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 12212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Convert hex digit to value. 12222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic int UnHex(int c) { 12232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ('0' <= c && c <= '9') 12242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return c - '0'; 12252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ('A' <= c && c <= 'F') 12262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return c - 'A' + 10; 12272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ('a' <= c && c <= 'f') 12282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return c - 'a' + 10; 12292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Bad hex digit " << c; 12302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return 0; 12312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 12322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 12332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Parse an escape sequence (e.g., \n, \{). 12342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Sets *s to span the remainder of the string. 12352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Sets *rp to the named character. 12362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool ParseEscape(StringPiece* s, Rune* rp, 12372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus* status, int rune_max) { 12382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* begin = s->begin(); 12392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() < 1 || (*s)[0] != '\\') { 12402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Should not happen - caller always checks. 12412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpInternalError); 12422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(NULL); 12432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 12442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 12452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() < 2) { 12462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpTrailingBackslash); 12472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(NULL); 12482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 12492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 12502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune c, c1; 12512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->remove_prefix(1); // backslash 12522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (StringPieceToRune(&c, s, status) < 0) 12532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 12542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int code; 12552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (c) { 12562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 12572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (c < Runeself && !isalpha(c) && !isdigit(c)) { 12582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Escaped non-word characters are always themselves. 12592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // PCRE is not quite so rigorous: it accepts things like 12602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // \q, but we don't. We once rejected \_, but too many 12612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // programs and people insist on using it, so allow \_. 12622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *rp = c; 12632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 12642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 12652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto BadEscape; 12662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 12672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Octal escapes. 12682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '1': 12692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '2': 12702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '3': 12712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '4': 12722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '5': 12732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '6': 12742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '7': 12752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Single non-zero octal digit is a backreference; not supported. 12762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() == 0 || (*s)[0] < '0' || (*s)[0] > '7') 12772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto BadEscape; 12782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // fall through 12792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '0': 12802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // consume up to three octal digits; already have one. 12812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson code = c - '0'; 12822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') { 12832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson code = code * 8 + c - '0'; 12842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->remove_prefix(1); // digit 12852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() > 0) { 12862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c = (*s)[0]; 12872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ('0' <= c && c <= '7') { 12882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson code = code * 8 + c - '0'; 12892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->remove_prefix(1); // digit 12902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 12912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 12922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 12932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *rp = code; 12942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 12952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 12962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Hexadecimal escapes 12972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 'x': 12982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() == 0) 12992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto BadEscape; 13002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (StringPieceToRune(&c, s, status) < 0) 13012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 13022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (c == '{') { 13032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Any number of digits in braces. 13042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Update n as we consume the string, so that 13052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the whole thing gets shown in the error message. 13062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Perl accepts any text at all; it ignores all text 13072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // after the first non-hex digit. We require only hex digits, 13082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and at least one. 13092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (StringPieceToRune(&c, s, status) < 0) 13102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 13112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nhex = 0; 13122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson code = 0; 13132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (IsHex(c)) { 13142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nhex++; 13152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson code = code * 16 + UnHex(c); 13162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (code > rune_max) 13172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto BadEscape; 13182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() == 0) 13192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto BadEscape; 13202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (StringPieceToRune(&c, s, status) < 0) 13212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 13222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 13232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (c != '}' || nhex == 0) 13242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto BadEscape; 13252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *rp = code; 13262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 13272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 13282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Easy case: two hex digits. 13292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() == 0) 13302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto BadEscape; 13312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (StringPieceToRune(&c1, s, status) < 0) 13322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 13332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!IsHex(c) || !IsHex(c1)) 13342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto BadEscape; 13352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *rp = UnHex(c) * 16 + UnHex(c1); 13362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 13372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // C escapes. 13392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 'n': 13402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *rp = '\n'; 13412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 13422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 'r': 13432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *rp = '\r'; 13442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 13452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 't': 13462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *rp = '\t'; 13472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 13482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Less common C escapes. 13502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 'a': 13512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *rp = '\a'; 13522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 13532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 'f': 13542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *rp = '\f'; 13552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 13562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 'v': 13572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *rp = '\v'; 13582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 13592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // This code is disabled to avoid misparsing 13612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the Perl word-boundary \b as a backspace 13622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // when in POSIX regexp mode. Surprisingly, 13632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // in Perl, \b means word-boundary but [\b] 13642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // means backspace. We don't support that: 13652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // if you want a backspace embed a literal 13662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // backspace character or use \x08. 13672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 13682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // case 'b': 13692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // *rp = '\b'; 13702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // return true; 13712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 13722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Not reached in ParseEscape."; 13742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonBadEscape: 13762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Unrecognized escape sequence. 13772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpBadEscape); 13782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(StringPiece(begin, s->data() - begin)); 13792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 13802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 13812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Add a range to the character class, but exclude newline if asked. 13832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Also handle case folding. 13842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid CharClassBuilder::AddRangeFlags( 13852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune lo, Rune hi, Regexp::ParseFlags parse_flags) { 13862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Take out \n if the flags say so. 13882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool cutnl = !(parse_flags & Regexp::ClassNL) || 13892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (parse_flags & Regexp::NeverNL); 13902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (cutnl && lo <= '\n' && '\n' <= hi) { 13912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (lo < '\n') 13922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddRangeFlags(lo, '\n' - 1, parse_flags); 13932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (hi > '\n') 13942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddRangeFlags('\n' + 1, hi, parse_flags); 13952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 13962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 13972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If folding case, add fold-equivalent characters too. 13992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (parse_flags & Regexp::FoldCase) 14002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddFoldedRange(this, lo, hi, 0); 14012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 14022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddRange(lo, hi); 14032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 14042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Look for a group with the given name. 14062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic UGroup* LookupGroup(const StringPiece& name, 14072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson UGroup *groups, int ngroups) { 14082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Simple name lookup. 14092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < ngroups; i++) 14102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (StringPiece(groups[i].name) == name) 14112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return &groups[i]; 14122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 14132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 14142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Fake UGroup containing all Runes 14162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic URange16 any16[] = { { 0, 65535 } }; 14172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic URange32 any32[] = { { 65536, Runemax } }; 14182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic UGroup anygroup = { "Any", +1, any16, 1, any32, 1 }; 14192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Look for a POSIX group with the given name (e.g., "[:^alpha:]") 14212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic UGroup* LookupPosixGroup(const StringPiece& name) { 14222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return LookupGroup(name, posix_groups, num_posix_groups); 14232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 14242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic UGroup* LookupPerlGroup(const StringPiece& name) { 14262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return LookupGroup(name, perl_groups, num_perl_groups); 14272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 14282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Look for a Unicode group with the given name (e.g., "Han") 14302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic UGroup* LookupUnicodeGroup(const StringPiece& name) { 14312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Special case: "Any" means any. 14322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (name == StringPiece("Any")) 14332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return &anygroup; 14342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return LookupGroup(name, unicode_groups, num_unicode_groups); 14352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 14362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Add a UGroup or its negation to the character class. 14382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic void AddUGroup(CharClassBuilder *cc, UGroup *g, int sign, 14392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp::ParseFlags parse_flags) { 14402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (sign == +1) { 14412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < g->nr16; i++) { 14422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags); 14432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < g->nr32; i++) { 14452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags); 14462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 14482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (parse_flags & Regexp::FoldCase) { 14492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Normally adding a case-folded group means 14502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // adding all the extra fold-equivalent runes too. 14512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // But if we're adding the negation of the group, 14522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // we have to exclude all the runes that are fold-equivalent 14532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // to what's already missing. Too hard, so do in two steps. 14542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CharClassBuilder ccb1; 14552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddUGroup(&ccb1, g, +1, parse_flags); 14560d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // If the flags say to take out \n, put it in, so that negating will take it out. 14570d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags. 14580d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin bool cutnl = !(parse_flags & Regexp::ClassNL) || 14590d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin (parse_flags & Regexp::NeverNL); 14600d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (cutnl) { 14610d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ccb1.AddRange('\n', '\n'); 14620d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 14632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ccb1.Negate(); 14642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cc->AddCharClass(&ccb1); 14652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 14662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int next = 0; 14682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < g->nr16; i++) { 14692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (next < g->r16[i].lo) 14702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags); 14712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson next = g->r16[i].hi + 1; 14722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < g->nr32; i++) { 14742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (next < g->r32[i].lo) 14752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags); 14762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson next = g->r32[i].hi + 1; 14772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (next <= Runemax) 14792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cc->AddRangeFlags(next, Runemax, parse_flags); 14802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 14822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Maybe parse a Perl character class escape sequence. 14842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Only recognizes the Perl character classes (\d \s \w \D \S \W), 14852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// not the Perl empty-string classes (\b \B \A \Z \z). 14862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// On success, sets *s to span the remainder of the string 14872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and returns the corresponding UGroup. 14882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The StringPiece must *NOT* be edited unless the call succeeds. 14892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonUGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { 14902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!(parse_flags & Regexp::PerlClasses)) 14912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 14922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() < 2 || (*s)[0] != '\\') 14932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 14942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Could use StringPieceToRune, but there aren't 14952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // any non-ASCII Perl group names. 14962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece name(s->begin(), 2); 14972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson UGroup *g = LookupPerlGroup(name); 14982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (g == NULL) 14992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 15002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->remove_prefix(name.size()); 15012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return g; 15022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 15032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonenum ParseStatus { 15052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kParseOk, // Did some parsing. 15062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kParseError, // Found an error. 15072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kParseNothing, // Decided not to parse. 15082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 15092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Maybe parses a Unicode character group like \p{Han} or \P{Han} 15112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// (the latter is a negated group). 15122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, 15132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CharClassBuilder *cc, 15142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus* status) { 15152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Decide whether to parse. 15162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!(parse_flags & Regexp::UnicodeGroups)) 15172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kParseNothing; 15182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() < 2 || (*s)[0] != '\\') 15192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kParseNothing; 15202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune c = (*s)[1]; 15212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (c != 'p' && c != 'P') 15222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kParseNothing; 15232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Committed to parse. Results: 15252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int sign = +1; // -1 = negated char class 15262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (c == 'P') 15272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sign = -1; 15282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece seq = *s; // \p{Han} or \pL 15292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece name; // Han or L 15302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->remove_prefix(2); // '\\', 'p' 15312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!StringPieceToRune(&c, s, status)) 15332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kParseError; 15342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (c != '{') { 15352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Name is the bit of string we just skipped over for c. 15362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* p = seq.begin() + 2; 15372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson name = StringPiece(p, s->begin() - p); 15382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 15392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Name is in braces. Look for closing } 15402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int end = s->find('}', 0); 15412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (end == s->npos) { 15422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!IsValidUTF8(seq, status)) 15432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kParseError; 15442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpBadCharRange); 15452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(seq); 15462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kParseError; 15472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 15482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson name = StringPiece(s->begin(), end); // without '}' 15492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->remove_prefix(end + 1); // with '}' 15502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!IsValidUTF8(name, status)) 15512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kParseError; 15522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 15532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Chop seq where s now begins. 15552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson seq = StringPiece(seq.begin(), s->begin() - seq.begin()); 15562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Look up group 15582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (name.size() > 0 && name[0] == '^') { 15592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sign = -sign; 15602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson name.remove_prefix(1); // '^' 15612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 15622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson UGroup *g = LookupUnicodeGroup(name); 15632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (g == NULL) { 15642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpBadCharRange); 15652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(seq); 15662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kParseError; 15672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 15682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddUGroup(cc, g, sign, parse_flags); 15702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kParseOk; 15712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 15722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Parses a character class name like [:alnum:]. 15742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Sets *s to span the remainder of the string. 15752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Adds the ranges corresponding to the class to ranges. 15762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, 15772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CharClassBuilder *cc, 15782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus* status) { 15792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Check begins with [: 15802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* p = s->data(); 15812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* ep = s->data() + s->size(); 15822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ep - p < 2 || p[0] != '[' || p[1] != ':') 15832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kParseNothing; 15842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Look for closing :]. 15862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* q; 15872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++) 15882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ; 15892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If no closing :], then ignore. 15912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (q > ep-2) 15922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kParseNothing; 15932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Got it. Check that it's valid. 15952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q += 2; 15962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece name(p, q-p); 15972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson UGroup *g = LookupPosixGroup(name); 15992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (g == NULL) { 16002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpBadCharRange); 16012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(name); 16022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kParseError; 16032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->remove_prefix(name.size()); 16062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddUGroup(cc, g, g->sign, parse_flags); 16072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kParseOk; 16082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 16092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Parses a character inside a character class. 16112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// There are fewer special characters here than in the rest of the regexp. 16122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Sets *s to span the remainder of the string. 16132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Sets *rp to the character. 16142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, 16152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const StringPiece& whole_class, 16162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus* status) { 16172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() == 0) { 16182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpMissingBracket); 16192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(whole_class); 16202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 16212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Allow regular escape sequences even though 16242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // many need not be escaped in this context. 16252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() >= 1 && (*s)[0] == '\\') 16262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ParseEscape(s, rp, status, rune_max_); 16272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Otherwise take the next rune. 16292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return StringPieceToRune(rp, s, status) >= 0; 16302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 16312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Parses a character class character, or, if the character 16332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// is followed by a hyphen, parses a character class range. 16342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// For single characters, rr->lo == rr->hi. 16352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Sets *s to span the remainder of the string. 16362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Sets *rp to the character. 16372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, 16382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const StringPiece& whole_class, 16392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus* status) { 16402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece os = *s; 16412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ParseCCCharacter(s, &rr->lo, whole_class, status)) 16422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 16432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // [a-] means (a|-), so check for final ]. 16442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') { 16452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->remove_prefix(1); // '-' 16462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ParseCCCharacter(s, &rr->hi, whole_class, status)) 16472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 16482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (rr->hi < rr->lo) { 16492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpBadCharRange); 16502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(StringPiece(os.data(), s->data() - os.data())); 16512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 16522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 16542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rr->hi = rr->lo; 16552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 16572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 16582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Parses a possibly-negated character class expression like [^abx-z[:digit:]]. 16602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Sets *s to span the remainder of the string. 16612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Sets *out_re to the regexp for the class. 16622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::ParseCharClass(StringPiece* s, 16632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp** out_re, 16642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus* status) { 16652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece whole_class = *s; 16662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() == 0 || (*s)[0] != '[') { 16672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Caller checked this. 16682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpInternalError); 16692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(NULL); 16702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 16712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool negated = false; 16732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 16742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_ = new CharClassBuilder; 16752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->remove_prefix(1); // '[' 16762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() > 0 && (*s)[0] == '^') { 16772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->remove_prefix(1); // '^' 16782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson negated = true; 16792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!(flags_ & ClassNL) || (flags_ & NeverNL)) { 16802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If NL can't match implicitly, then pretend 16812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // negated classes include a leading \n. 16822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_->AddRange('\n', '\n'); 16832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool first = true; // ] is okay as first char in class 16862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (s->size() > 0 && ((*s)[0] != ']' || first)) { 16872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // - is only okay unescaped as first or last in class. 16882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Except that Perl allows - anywhere. 16892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((*s)[0] == '-' && !first && !(flags_&PerlX) && 16902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (s->size() == 1 || (*s)[1] != ']')) { 16912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece t = *s; 16922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(1); // '-' 16932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune r; 16942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int n = StringPieceToRune(&r, &t, status); 16952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (n < 0) { 16962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 16972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 16982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpBadCharRange); 17002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(StringPiece(s->data(), 1+n)); 17012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 17022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 17032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson first = false; 17052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Look for [:alnum:] etc. 17072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') { 17082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (ParseCCName(s, flags_, re->ccb_, status)) { 17092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kParseOk: 17102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 17112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kParseError: 17122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 17132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 17142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kParseNothing: 17152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 17162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Look for Unicode character group like \p{Han} 17202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() > 2 && 17212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (*s)[0] == '\\' && 17222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ((*s)[1] == 'p' || (*s)[1] == 'P')) { 17232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) { 17242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kParseOk: 17252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 17262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kParseError: 17272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 17282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 17292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kParseNothing: 17302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 17312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Look for Perl character class symbols (extension). 17352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson UGroup *g = MaybeParsePerlCCEscape(s, flags_); 17362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (g != NULL) { 17372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddUGroup(re->ccb_, g, g->sign, flags_); 17382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 17392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Otherwise assume single character or simple range. 17422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RuneRange rr; 17432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ParseCCRange(s, &rr, whole_class, status)) { 17442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 17452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 17462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // AddRangeFlags is usually called in response to a class like 17482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // \p{Foo} or [[:foo:]]; for those, it filters \n out unless 17492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Regexp::ClassNL is set. In an explicit range or singleton 17502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // like we just parsed, we do not filter \n out, so set ClassNL 17512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // in the flags. 17522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL); 17532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->size() == 0) { 17552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpMissingBracket); 17562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(whole_class); 17572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 17582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 17592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->remove_prefix(1); // ']' 17612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (negated) 17632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_->Negate(); 17642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_->RemoveAbove(rune_max_); 17652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *out_re = re; 17672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 17682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 17692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Is this a valid capture name? [A-Za-z0-9_]+ 17712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// PCRE limits names to 32 bytes. 17722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Python rejects names starting with digits. 17732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// We don't enforce either of those. 17742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool IsValidCaptureName(const StringPiece& name) { 17752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (name.size() == 0) 17762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 17772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < name.size(); i++) { 17782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int c = name[i]; 17792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (('0' <= c && c <= '9') || 17802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ('a' <= c && c <= 'z') || 17812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ('A' <= c && c <= 'Z') || 17822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c == '_') 17832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 17842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 17852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 17872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 17882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Parses a Perl flag setting or non-capturing group or both, 17902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// like (?i) or (?: or (?i:. Removes from s, updates parse state. 17912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The caller must check that s begins with "(?". 17922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns true on success. If the Perl flag is not 17932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// well-formed or not supported, sets status_ and returns false. 17942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { 17952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece t = *s; 17962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Caller is supposed to check this. 17982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') { 17992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags"; 18002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_code(kRegexpInternalError); 18012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 18022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 18032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(2); // "(?" 18052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Check for named captures, first introduced in Python's regexp library. 18072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // As usual, there are three slightly different syntaxes: 18082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 18092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (?P<name>expr) the original, introduced by Python 18102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (?<name>expr) the .NET alteration, adopted by Perl 5.10 18112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (?'name'expr) another .NET alteration, adopted by Perl 5.10 18122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 18132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Perl 5.10 gave in and implemented the Python version too, 18142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // but they claim that the last two are the preferred forms. 18152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // PCRE and languages based on it (specifically, PHP and Ruby) 18162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // support all three as well. EcmaScript 4 uses only the Python form. 18172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 18182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // In both the open source world (via Code Search) and the 18192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Google source tree, (?P<expr>name) is the dominant form, 18202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // so that's the one we implement. One is enough. 18212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t.size() > 2 && t[0] == 'P' && t[1] == '<') { 18222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pull out name. 18232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int end = t.find('>', 2); 18242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (end == t.npos) { 18252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!IsValidUTF8(*s, status_)) 18262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 18272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_code(kRegexpBadNamedCapture); 18282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_error_arg(*s); 18292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 18302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 18312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // t is "P<name>...", t[end] == '>' 18332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece capture(t.begin()-2, end+3); // "(?P<name>" 18342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece name(t.begin()+2, end-2); // "name" 18352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!IsValidUTF8(name, status_)) 18362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 18372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!IsValidCaptureName(name)) { 18382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_code(kRegexpBadNamedCapture); 18392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_error_arg(capture); 18402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 18412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 18422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!DoLeftParen(name)) { 18442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // DoLeftParen's failure set status_. 18452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 18462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 18472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->remove_prefix(capture.end() - s->begin()); 18492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 18502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 18512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool negated = false; 18532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool sawflags = false; 18542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nflags = flags_; 18552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune c; 18562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (bool done = false; !done; ) { 18572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t.size() == 0) 18582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto BadPerlOp; 18592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (StringPieceToRune(&c, &t, status_) < 0) 18602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 18612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (c) { 18622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 18632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto BadPerlOp; 18642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Parse flags. 18662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 'i': 18672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sawflags = true; 18682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (negated) 18692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nflags &= ~FoldCase; 18702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 18712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nflags |= FoldCase; 18722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 18732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 'm': // opposite of our OneLine 18752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sawflags = true; 18762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (negated) 18772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nflags |= OneLine; 18782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 18792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nflags &= ~OneLine; 18802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 18812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 's': 18832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sawflags = true; 18842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (negated) 18852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nflags &= ~DotNL; 18862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 18872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nflags |= DotNL; 18882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 18892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 'U': 18912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sawflags = true; 18922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (negated) 18932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nflags &= ~NonGreedy; 18942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 18952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nflags |= NonGreedy; 18962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 18972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Negation 18992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '-': 19002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (negated) 19012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto BadPerlOp; 19022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson negated = true; 19032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sawflags = false; 19042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 19052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Open new group. 19072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case ':': 19082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!DoLeftParenNoCapture()) { 19092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // DoLeftParenNoCapture's failure set status_. 19102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 19112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 19122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson done = true; 19132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 19142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Finish flags. 19162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case ')': 19172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson done = true; 19182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 19192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 19202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 19212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (negated && !sawflags) 19232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto BadPerlOp; 19242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags_ = static_cast<Regexp::ParseFlags>(nflags); 19262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *s = t; 19272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 19282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonBadPerlOp: 19302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_code(kRegexpBadPerlOp); 19312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin())); 19322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 19332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 19342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Converts latin1 (assumed to be encoded as Latin1 bytes) 19362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// into UTF8 encoding in string. 19372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is 19382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// deprecated and because it rejects code points 0x80-0x9F. 19392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid ConvertLatin1ToUTF8(const StringPiece& latin1, string* utf) { 19402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson char buf[UTFmax]; 19412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson utf->clear(); 19432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < latin1.size(); i++) { 19442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune r = latin1[i] & 0xFF; 19452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int n = runetochar(buf, &r); 19462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson utf->append(buf, n); 19472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 19482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 19492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Parses the regular expression given by s, 19512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// returning the corresponding Regexp tree. 19522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The caller must Decref the return value when done with it. 19532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns NULL on error. 19542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonRegexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, 19552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus* status) { 19562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Make status non-NULL (easier on everyone else). 19572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpStatus xstatus; 19582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (status == NULL) 19592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status = &xstatus; 19602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ParseState ps(global_flags, s, status); 19622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece t = s; 19632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Convert regexp to UTF-8 (easier on the rest of the parser). 19652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (global_flags & Latin1) { 19662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson string* tmp = new string; 19672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ConvertLatin1ToUTF8(t, tmp); 19682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_tmp(tmp); 19692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t = *tmp; 19702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 19712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (global_flags & Literal) { 19732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Special parse loop for literal string. 19742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (t.size() > 0) { 19752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune r; 19762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (StringPieceToRune(&r, &t, status) < 0) 19772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 19782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushLiteral(r)) 19792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 19802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 19812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ps.DoFinish(); 19822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 19832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece lastunary = NULL; 19852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (t.size() > 0) { 19862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece isunary = NULL; 19872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (t[0]) { 19882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: { 19892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune r; 19902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (StringPieceToRune(&r, &t, status) < 0) 19912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 19922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushLiteral(r)) 19932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 19942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 19952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 19962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '(': 19982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // "(?" introduces Perl escape. 19992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) { 20002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Flag changes and non-capturing groups. 20012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.ParsePerlFlags(&t)) 20022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 20032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20050d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (ps.flags() & NeverCapture) { 20060d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (!ps.DoLeftParenNoCapture()) 20070d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return NULL; 20080d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } else { 20090d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (!ps.DoLeftParen(NULL)) 20100d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return NULL; 20110d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 20122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(1); // '(' 20132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '|': 20162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.DoVerticalBar()) 20172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 20182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(1); // '|' 20192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case ')': 20222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.DoRightParen()) 20232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 20242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(1); // ')' 20252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '^': // Beginning of line. 20282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushCarat()) 20292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 20302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(1); // '^' 20312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '$': // End of line. 20342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushDollar()) 20352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 20362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(1); // '$' 20372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '.': // Any character (possibly except newline). 20402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushDot()) 20412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 20422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(1); // '.' 20432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '[': { // Character class. 20462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re; 20472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.ParseCharClass(&t, &re, status)) 20482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 20492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushRegexp(re)) 20502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 20512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '*': { // Zero or more. 20552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RegexpOp op; 20562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson op = kRegexpStar; 20572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto Rep; 20582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '+': // One or more. 20592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson op = kRegexpPlus; 20602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto Rep; 20612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '?': // Zero or one. 20622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson op = kRegexpQuest; 20632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto Rep; 20642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rep: 20652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece opstr = t; 20662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool nongreedy = false; 20672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(1); // '*' or '+' or '?' 20682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ps.flags() & PerlX) { 20692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t.size() > 0 && t[0] == '?') { 20702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nongreedy = true; 20712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(1); // '?' 20722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (lastunary.size() > 0) { 20742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // In Perl it is not allowed to stack repetition operators: 20752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // a** is a syntax error, not a double-star. 20762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (and a++ means something else entirely, which we don't support!) 20772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpRepeatOp); 20782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(StringPiece(lastunary.begin(), 20792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.begin() - lastunary.begin())); 20802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 20812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson opstr.set(opstr.data(), t.data() - opstr.data()); 20842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushRepeatOp(op, opstr, nongreedy)) 20852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 20862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson isunary = opstr; 20872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '{': { // Counted repetition. 20912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int lo, hi; 20922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece opstr = t; 20932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!MaybeParseRepetition(&t, &lo, &hi)) { 20942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Treat like a literal. 20952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushLiteral('{')) 20962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 20972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(1); // '{' 20982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool nongreedy = false; 21012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ps.flags() & PerlX) { 21022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t.size() > 0 && t[0] == '?') { 21032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nongreedy = true; 21042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(1); // '?' 21052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (lastunary.size() > 0) { 21072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Not allowed to stack repetition operators. 21082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_code(kRegexpRepeatOp); 21092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson status->set_error_arg(StringPiece(lastunary.begin(), 21102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.begin() - lastunary.begin())); 21112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 21122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson opstr.set(opstr.data(), t.data() - opstr.data()); 21152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushRepetition(lo, hi, opstr, nongreedy)) 21162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 21172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson isunary = opstr; 21182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 21192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 21212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case '\\': { // Escaped character or Perl sequence. 21222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // \b and \B: word boundary or not 21232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((ps.flags() & Regexp::PerlB) && 21242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) { 21252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushWordBoundary(t[1] == 'b')) 21262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 21272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(2); // '\\', 'b' 21282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 21292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 21312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) { 21322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t[1] == 'A') { 21332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushSimpleOp(kRegexpBeginText)) 21342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 21352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(2); // '\\', 'A' 21362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 21372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t[1] == 'z') { 21392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushSimpleOp(kRegexpEndText)) 21402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 21412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(2); // '\\', 'z' 21422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 21432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Do not recognize \Z, because this library can't 21452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // implement the exact Perl/PCRE semantics. 21462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (This library treats "(?-m)$" as \z, even though 21472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // in Perl and PCRE it is equivalent to \Z.) 21482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 21492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t[1] == 'C') { // \C: any byte [sic] 21502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushSimpleOp(kRegexpAnyByte)) 21512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 21522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(2); // '\\', 'C' 21532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 21542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 21562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t[1] == 'Q') { // \Q ... \E: the ... is always literals 21572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(2); // '\\', 'Q' 21582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (t.size() > 0) { 21592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') { 21602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t.remove_prefix(2); // '\\', 'E' 21612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 21622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune r; 21642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (StringPieceToRune(&r, &t, status) < 0) 21652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 21662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushLiteral(r)) 21672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 21682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 21702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 21732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) { 21742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); 21752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_ = new CharClassBuilder; 21762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) { 21772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kParseOk: 21782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushRegexp(re)) 21792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 21802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto Break2; 21812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kParseError: 21822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 21832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 21842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kParseNothing: 21852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 21862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 21872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 21902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); 21912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (g != NULL) { 21922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); 21932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->ccb_ = new CharClassBuilder; 21942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddUGroup(re->ccb_, g, g->sign, ps.flags()); 21952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushRegexp(re)) 21962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 21972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 21982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 22002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune r; 22012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ParseEscape(&t, &r, status, ps.rune_max())) 22022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 22032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ps.PushLiteral(r)) 22042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 22052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 22062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 22072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 22082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Break2: 22092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lastunary = isunary; 22102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 22112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ps.DoFinish(); 22122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 22132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 22142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} // namespace re2 2215