12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2006 The RE2 Authors.  All Rights Reserved.
22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style
32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file.
42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// --- SPONSORED LINK --------------------------------------------------
62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If you want to use this library for regular expression matching,
72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// you should use re2/re2.h, which provides a class RE2 that
82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// mimics the PCRE interface provided by PCRE's C++ wrappers.
92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This header describes the low-level interface used to implement RE2
102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and may change in backwards-incompatible ways from time to time.
112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// In contrast, RE2's interface will not.
122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// ---------------------------------------------------------------------
132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Regular expression library: parsing, execution, and manipulation
152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// of regular expressions.
162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Any operation that traverses the Regexp structures should be written
182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// regular expressions such as x++++++++++++++++++++... might cause recursive
202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// traversals to overflow the stack.
212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// It is the caller's responsibility to provide appropriate mutual exclusion
232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// around manipulation of the regexps.  RE2 does this.
242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// PARSING
262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Regexp::Parse parses regular expressions encoded in UTF-8.
282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The default syntax is POSIX extended regular expressions,
292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// with the following changes:
302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//   1.  Backreferences (optional in POSIX EREs) are not supported.
322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//         (Supporting them precludes the use of DFA-based
332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//          matching engines.)
342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//   2.  Collating elements and collation classes are not supported.
362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//         (No one has needed or wanted them.)
372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The exact syntax accepted can be modified by passing flags to
392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Regexp::Parse.  In particular, many of the basic Perl additions
402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// are available.  The flags are documented below (search for LikePerl).
412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If parsed with the flag Regexp::Latin1, both the regular expression
432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and the input to the matching routines are assumed to be encoded in
442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Latin-1, not UTF-8.
452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// EXECUTION
472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Once Regexp has parsed a regular expression, it provides methods
492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// to search text using that regular expression.  These methods are
502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// implemented via calling out to other regular expression libraries.
512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// (Let's call them the sublibraries.)
522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// To call a sublibrary, Regexp does not simply prepare a
542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// string version of the regular expression and hand it to the
552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// sublibrary.  Instead, Regexp prepares, from its own parsed form, the
562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// corresponding internal representation used by the sublibrary.
572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This has the drawback of needing to know the internal representation
582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// used by the sublibrary, but it has two important benefits:
592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//   1. The syntax and meaning of regular expressions is guaranteed
612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//      to be that used by Regexp's parser, not the syntax expected
622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//      by the sublibrary.  Regexp might accept a restricted or
632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//      expanded syntax for regular expressions as compared with
642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//      the sublibrary.  As long as Regexp can translate from its
652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//      internal form into the sublibrary's, clients need not know
662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//      exactly which sublibrary they are using.
672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//   2. The sublibrary parsers are bypassed.  For whatever reason,
692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//      sublibrary regular expression parsers often have security
702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//      problems.  For example, plan9grep's regular expression parser
712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//      has a buffer overflow in its handling of large character
722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//      classes, and PCRE's parser has had buffer overflow problems
732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//      in the past.  Security-team requires sandboxing of sublibrary
742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//      regular expression parsers.  Avoiding the sublibrary parsers
752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//      avoids the sandbox.
762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The execution methods we use now are provided by the compiled form,
782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Prog, described in prog.h
792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// MANIPULATION
812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Unlike other regular expression libraries, Regexp makes its parsed
832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// form accessible to clients, so that client code can analyze the
842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// parsed regular expressions.
852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#ifndef RE2_REGEXP_H__
872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#define RE2_REGEXP_H__
882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/util.h"
902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/stringpiece.h"
912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 {
932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonenum RegexpOp {
962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches no strings.
972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpNoMatch = 1,
982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches empty string.
1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpEmptyMatch,
1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches rune_.
1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpLiteral,
1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches runes_.
1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpLiteralString,
1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches concatenation of sub_[0..nsub-1].
1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpConcat,
1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches union of sub_[0..nsub-1].
1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpAlternate,
1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches sub_[0] zero or more times.
1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpStar,
1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches sub_[0] one or more times.
1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpPlus,
1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches sub_[0] zero or one times.
1182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpQuest,
1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches sub_[0] at least min_ times, at most max_ times.
1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // max_ == -1 means no upper limit.
1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpRepeat,
1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Parenthesized (capturing) subexpression.  Index is cap_.
1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Optionally, capturing name is name_.
1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpCapture,
1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches any character.
1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpAnyChar,
1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches any byte [sic].
1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpAnyByte,
1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches empty string at beginning of line.
1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpBeginLine,
1362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches empty string at end of line.
1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpEndLine,
1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches word boundary "\b".
1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpWordBoundary,
1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches not-a-word boundary "\B".
1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpNoWordBoundary,
1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches empty string at beginning of text.
1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpBeginText,
1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches empty string at end of text.
1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpEndText,
1482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matches character class given by cc_.
1502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpCharClass,
1512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Forces match of entire expression right now,
1532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // with match ID match_id_ (used by RE2::Set).
1542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpHaveMatch,
1552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kMaxRegexpOp = kRegexpHaveMatch,
1572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
1582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Keep in sync with string list in regexp.cc
1602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonenum RegexpStatusCode {
1612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // No error
1622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpSuccess = 0,
1632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Unexpected error
1652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpInternalError,
1662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Parse errors
1682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpBadEscape,          // bad escape sequence
1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpBadCharClass,       // bad character class
1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpBadCharRange,       // bad character class range
1712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpMissingBracket,     // missing closing ]
1722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpMissingParen,       // missing closing )
1732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpTrailingBackslash,  // at end of regexp
1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpRepeatArgument,     // repeat argument missing, e.g. "*"
1752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpRepeatSize,         // bad repetition argument
1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpRepeatOp,           // bad repetition operator
1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpBadPerlOp,          // bad perl operator
1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpBadUTF8,            // invalid UTF-8 in regexp
1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kRegexpBadNamedCapture,    // bad named capture
1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Error status for certain operations.
1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass RegexpStatus {
1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public:
1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  ~RegexpStatus() { delete tmp_; }
1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void set_code(enum RegexpStatusCode code) { code_ = code; }
1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  enum RegexpStatusCode code() const { return code_; }
1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  const StringPiece& error_arg() const { return error_arg_; }
1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool ok() const { return code() == kRegexpSuccess; }
1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Copies state from status.
1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void Copy(const RegexpStatus& status);
1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns text equivalent of code, e.g.:
1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //   "Bad character class"
2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static const string& CodeText(enum RegexpStatusCode code);
2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns text describing error, e.g.:
2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //   "Bad character class: [z-a]"
2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  string Text() const;
2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private:
2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  enum RegexpStatusCode code_;  // Kind of error
2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  StringPiece error_arg_;       // Piece of regexp containing syntax error.
2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  string* tmp_;                 // Temporary storage, possibly where error_arg_ is.
2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Walker to implement Simplify.
2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass SimplifyWalker;
2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Compiled form; see prog.h
2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass Prog;
2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstruct RuneRange {
2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  RuneRange() : lo(0), hi(0) { }
2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  RuneRange(int l, int h) : lo(l), hi(h) { }
2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Rune lo;
2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Rune hi;
2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Less-than on RuneRanges treats a == b if they overlap at all.
2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This lets us look in a set to find the range covering a particular Rune.
2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstruct RuneRangeLess {
2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool operator()(const RuneRange& a, const RuneRange& b) const {
2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return a.hi < b.lo;
2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass CharClassBuilder;
2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass CharClass {
2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public:
2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void Delete();
2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  typedef RuneRange* iterator;
2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  iterator begin() { return ranges_; }
2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  iterator end() { return ranges_ + nranges_; }
2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int size() { return nrunes_; }
2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool empty() { return nrunes_ == 0; }
2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool full() { return nrunes_ == Runemax+1; }
2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool FoldsASCII() { return folds_ascii_; }
2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool Contains(Rune r);
2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  CharClass* Negate();
2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private:
2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  CharClass();  // not implemented
2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  ~CharClass();  // not implemented
2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static CharClass* New(int maxranges);
2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  friend class CharClassBuilder;
2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool folds_ascii_;
2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int nrunes_;
2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  RuneRange *ranges_;
2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int nranges_;
2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DISALLOW_EVIL_CONSTRUCTORS(CharClass);
2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass Regexp {
2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public:
2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Flags for parsing.  Can be ORed together.
2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  enum ParseFlags {
2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    NoParseFlags = 0,
2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    FoldCase     = 1<<0,   // Fold case during matching (case-insensitive).
2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Literal      = 1<<1,   // Treat s as literal string instead of a regexp.
2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    ClassNL      = 1<<2,   // Allow char classes like [^a-z] and \D and \s
2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           // and [[:space:]] to match newline.
2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    DotNL        = 1<<3,   // Allow . to match newline.
2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    MatchNL      = ClassNL | DotNL,
2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    OneLine      = 1<<4,   // Treat ^ and $ as only matching at beginning and
2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           // end of text, not around embedded newlines.
2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           // (Perl's default)
2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Latin1       = 1<<5,   // Regexp and text are in Latin1, not UTF-8.
2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    NonGreedy    = 1<<6,   // Repetition operators are non-greedy by default.
2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    PerlClasses  = 1<<7,   // Allow Perl character classes like \d.
2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    PerlB        = 1<<8,   // Allow Perl's \b and \B.
2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    PerlX        = 1<<9,   // Perl extensions:
2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           //   non-capturing parens - (?: )
2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           //   non-greedy operators - *? +? ?? {}?
2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           //   flag edits - (?i) (?-i) (?i: )
2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           //     i - FoldCase
2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           //     m - !OneLine
2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           //     s - DotNL
2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           //     U - NonGreedy
2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           //   line ends: \A \z
2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           //   \Q and \E to disable/enable metacharacters
2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           //   (?P<name>expr) for named captures
2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           //   \C to match any single byte
2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           //   and \P{Han} for its negation.
3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    NeverNL      = 1<<11,  // Never match NL, even if the regexp mentions
3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                           //   it explicitly.
3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // As close to Perl as we can get.
3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    LikePerl     = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                   UnicodeGroups,
3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Internal use only.
3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    WasDollar    = 1<<15,  // on kRegexpEndText: was $ in regexp text
3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  };
3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Get.  No set, Regexps are logically immutable once created.
3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  RegexpOp op() { return static_cast<RegexpOp>(op_); }
3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int nsub() { return nsub_; }
3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool simple() { return simple_; }
3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int Ref();  // For testing.
3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Regexp** sub() {
3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if(nsub_ <= 1)
3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return &subone_;
3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    else
3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return submany_;
3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Increments reference count, returns object as convenience.
3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Regexp* Incref();
3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Decrements reference count and deletes this object if count reaches 0.
3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void Decref();
3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Parses string s to produce regular expression, returned.
3422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Caller must release return value with re->Decref().
3432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // On failure, sets *status (if status != NULL) and returns NULL.
3442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* Parse(const StringPiece& s, ParseFlags flags,
3452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                       RegexpStatus* status);
3462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns a _new_ simplified version of the current regexp.
3482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Does not edit the current regexp.
3492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Caller must release return value with re->Decref().
3502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Simplified means that counted repetition has been rewritten
3512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // into simpler terms and all Perl/POSIX features have been
3522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // removed.  The result will capture exactly the same
3532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // subexpressions the original did, unless formatted with ToString.
3542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Regexp* Simplify();
3552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  friend class SimplifyWalker;
3562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Parses the regexp src and then simplifies it and sets *dst to the
3582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // string representation of the simplified form.  Returns true on success.
3592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns false and sets *status (if status != NULL) on parse error.
3602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
3612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                             string* dst,
3622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                             RegexpStatus* status);
3632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns the number of capturing groups in the regexp.
3652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int NumCaptures();
3662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  friend class NumCapturesWalker;
3672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns a map from names to capturing group indices,
3692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // or NULL if the regexp contains no named capture groups.
3702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // The caller is responsible for deleting the map.
3712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  map<string, int>* NamedCaptures();
3722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns a map from capturing group indices to capturing group
3742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // names or NULL if the regexp contains no named capture groups. The
3752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // caller is responsible for deleting the map.
3762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  map<int, string>* CaptureNames();
3772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns a string representation of the current regexp,
3792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // using as few parentheses as possible.
3802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  string ToString();
3812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Convenience functions.  They consume the passed reference,
3832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // so in many cases you should use, e.g., Plus(re->Incref(), flags).
3842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // They do not consume allocated arrays like subs or runes.
3852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* Plus(Regexp* sub, ParseFlags flags);
3862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* Star(Regexp* sub, ParseFlags flags);
3872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* Quest(Regexp* sub, ParseFlags flags);
3882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
3892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
3902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
3912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
3922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* NewLiteral(Rune rune, ParseFlags flags);
3932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
3942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
3952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* HaveMatch(int match_id, ParseFlags flags);
3962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Like Alternate but does not factor out common prefixes.
3982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
3992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Debugging function.  Returns string format for regexp
4012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // that makes structure clear.  Does NOT use regexp syntax.
4022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  string Dump();
4032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Helper traversal class, defined fully in walker-inl.h.
4052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  template<typename T> class Walker;
4062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Compile to Prog.  See prog.h
4082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Reverse prog expects to be run over text backward.
4092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Construction and execution of prog will
4102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // stay within approximately max_mem bytes of memory.
4112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // If max_mem <= 0, a reasonable default is used.
4122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Prog* CompileToProg(int64 max_mem);
4132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Prog* CompileToReverseProg(int64 max_mem);
4142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Whether to expect this library to find exactly the same answer as PCRE
4162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // when running this regexp.  Most regexps do mimic PCRE exactly, but a few
4172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // obscure cases behave differently.  Technically this is more a property
4182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // of the Prog than the Regexp, but the computation is much easier to do
4192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // on the Regexp.  See mimics_pcre.cc for the exact conditions.
4202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool MimicsPCRE();
4212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Benchmarking function.
4232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void NullWalk();
4242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Whether every match of this regexp must be anchored and
4262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // begin with a non-empty fixed string (perhaps after ASCII
4272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // case-folding).  If so, returns the prefix and the sub-regexp that
4282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // follows it.
4292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
4302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private:
4322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Constructor allocates vectors as appropriate for operator.
4332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  explicit Regexp(RegexpOp op, ParseFlags parse_flags);
4342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Use Decref() instead of delete to release Regexps.
4362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // This is private to catch deletes at compile time.
4372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  ~Regexp();
4382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void Destroy();
4392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool QuickDestroy();
4402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Helpers for Parse.  Listed here so they can edit Regexps.
4422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  class ParseState;
4432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  friend class ParseState;
4442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
4452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                             RegexpStatus* status);
4462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Helper for testing [sic].
4482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
4492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Computes whether Regexp is already simple.
4512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool ComputeSimple();
4522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Constructor that generates a concatenation or alternation,
4542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // enforcing the limit on the number of subexpressions for
4552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // a particular Regexp.
4562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
4572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                                   ParseFlags flags, bool can_factor);
4582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns the leading string that re starts with.
4602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // The returned Rune* points into a piece of re,
4612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // so it must not be used after the caller calls re->Decref().
4622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
4632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Removes the first n leading runes from the beginning of re.
4652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Edits re in place.
4662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static void RemoveLeadingString(Regexp* re, int n);
4672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns the leading regexp in re's top-level concatenation.
4692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // The returned Regexp* points at re or a sub-expression of re,
4702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // so it must not be used after the caller calls re->Decref().
4712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* LeadingRegexp(Regexp* re);
4722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Removes LeadingRegexp(re) from re and returns the remainder.
4742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Might edit re in place.
4752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Regexp* RemoveLeadingRegexp(Regexp* re);
4762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Simplifies an alternation of literal strings by factoring out
4782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // common prefixes.
4792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
4802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static int FactorAlternationRecursive(Regexp** sub, int nsub,
4812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                                        ParseFlags flags, int maxdepth);
4822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Is a == b?  Only efficient on regexps that have not been through
4842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Simplify yet - the expansion of a kRegexpRepeat will make this
4852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // take a long time.  Do not call on such regexps, hence private.
4862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static bool Equal(Regexp* a, Regexp* b);
4872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Allocate space for n sub-regexps.
4892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void AllocSub(int n) {
4902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (n < 0 || static_cast<uint16>(n) != n)
4912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      LOG(FATAL) << "Cannot AllocSub " << n;
4922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (n > 1)
4932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      submany_ = new Regexp*[n];
4942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    nsub_ = n;
4952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
4962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Add Rune to LiteralString
4982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void AddRuneToString(Rune r);
4992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Swaps this with that, in place.
5012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void Swap(Regexp *that);
5022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Operator.  See description of operators above.
5042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // uint8 instead of RegexpOp to control space usage.
5052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint8 op_;
5062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Is this regexp structure already simple
5082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // (has it been returned by Simplify)?
5092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // uint8 instead of bool to control space usage.
5102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint8 simple_;
5112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Flags saved from parsing and used during execution.
5132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // (Only FoldCase is used.)
5142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // uint16 instead of ParseFlags to control space usage.
5152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint16 parse_flags_;
5162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Reference count.  Exists so that SimplifyRegexp can build
5182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // regexp structures that are dags rather than trees to avoid
5192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // exponential blowup in space requirements.
5202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // uint16 to control space usage.
5212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // The standard regexp routines will never generate a
5222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // ref greater than the maximum repeat count (100),
5232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // but even so, Incref and Decref consult an overflow map
5242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // when ref_ reaches kMaxRef.
5252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint16 ref_;
5262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static const uint16 kMaxRef = 0xffff;
5272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Subexpressions.
5292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // uint16 to control space usage.
5302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Concat and Alternate handle larger numbers of subexpressions
5312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // by building concatenation or alternation trees.
5322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Other routines should call Concat or Alternate instead of
5332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // filling in sub() by hand.
5342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint16 nsub_;
5352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static const uint16 kMaxNsub = 0xffff;
5362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  union {
5372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Regexp** submany_;  // if nsub_ > 1
5382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Regexp* subone_;  // if nsub_ == 1
5392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  };
5402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Extra space for parse and teardown stacks.
5422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Regexp* down_;
5432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Arguments to operator.  See description of operators above.
5452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  union {
5462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    struct {  // Repeat
5472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      int max_;
5482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      int min_;
5492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    };
5502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    struct {  // Capture
5512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      int cap_;
5522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      string* name_;
5532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    };
5542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    struct {  // LiteralString
5552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      int nrunes_;
5562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      Rune* runes_;
5572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    };
5582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    struct {  // CharClass
5592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // These two could be in separate union members,
5602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // but it wouldn't save any space (there are other two-word structs)
5612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // and keeping them separate avoids confusion during parsing.
5622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      CharClass* cc_;
5632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      CharClassBuilder* ccb_;
5642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    };
5652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Rune rune_;  // Literal
5662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int match_id_;  // HaveMatch
5672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    void *the_union_[2];  // as big as any other element, for memset
5682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  };
5692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DISALLOW_EVIL_CONSTRUCTORS(Regexp);
5712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
5722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Character class set: contains non-overlapping, non-abutting RuneRanges.
5742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsontypedef set<RuneRange, RuneRangeLess> RuneRangeSet;
5752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass CharClassBuilder {
5772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public:
5782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  CharClassBuilder();
5792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  typedef RuneRangeSet::iterator iterator;
5812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  iterator begin() { return ranges_.begin(); }
5822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  iterator end() { return ranges_.end(); }
5832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int size() { return nrunes_; }
5852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool empty() { return nrunes_ == 0; }
5862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool full() { return nrunes_ == Runemax+1; }
5872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool Contains(Rune r);
5892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool FoldsASCII();
5902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool AddRange(Rune lo, Rune hi);  // returns whether class changed
5912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  CharClassBuilder* Copy();
5922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void AddCharClass(CharClassBuilder* cc);
5932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void Negate();
5942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void RemoveAbove(Rune r);
5952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  CharClass* GetCharClass();
5962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
5972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private:
5992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static const uint32 AlphaMask = (1<<26) - 1;
6002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint32 upper_;  // bitmap of A-Z
6012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint32 lower_;  // bitmap of a-z
6022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int nrunes_;
6032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  RuneRangeSet ranges_;
6042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
6052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
6062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
6082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsoninline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
6092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson{
6102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b));
6112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
6122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsoninline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
6142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson{
6152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(b));
6162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
6172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsoninline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
6192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson{
6202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(b));
6212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
6222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsoninline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
6242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson{
6252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return static_cast<Regexp::ParseFlags>(~static_cast<int>(a));
6262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
6272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}  // namespace re2
6312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#endif  // RE2_REGEXP_H__
633