re2.cc revision 2a99a7e74a7f215066514fe81d2bfa6639d9eddd
1// Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Regular expression interface RE2.
6//
7// Originally the PCRE C++ wrapper, but adapted to use
8// the new automata-based regular expression engines.
9
10#include "re2/re2.h"
11
12#include <stdio.h>
13#include <string>
14#ifdef WIN32
15#define strtoll _strtoi64
16#define strtoull _strtoui64
17#define strtof strtod
18#else
19#include <pthread.h>
20#endif
21#include <errno.h>
22#include "util/util.h"
23#include "util/flags.h"
24#include "re2/prog.h"
25#include "re2/regexp.h"
26
27DEFINE_bool(trace_re2, false, "trace RE2 execution");
28
29namespace re2 {
30
31// Maximum number of args we can set
32static const int kMaxArgs = 16;
33static const int kVecSize = 1+kMaxArgs;
34
35const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch = {};
36const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch = {};
37const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume = {};
38const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume = {};
39
40#define kDefaultMaxMem (8<<20)
41
42RE2::Options::Options()
43  :  encoding_(EncodingUTF8),
44     posix_syntax_(false),
45     longest_match_(false),
46     log_errors_(true),
47     max_mem_(kDefaultMaxMem),
48     literal_(false),
49     never_nl_(false),
50     never_capture_(false),
51     case_sensitive_(true),
52     perl_classes_(false),
53     word_boundary_(false),
54     one_line_(false) {
55}
56
57RE2::Options::Options(RE2::CannedOptions opt)
58  : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
59    posix_syntax_(opt == RE2::POSIX_SYNTAX),
60    longest_match_(opt == RE2::POSIX_SYNTAX),
61    log_errors_(opt != RE2::Quiet),
62    max_mem_(kDefaultMaxMem),
63    literal_(false),
64    never_nl_(false),
65    never_capture_(false),
66    case_sensitive_(true),
67    perl_classes_(false),
68    word_boundary_(false),
69    one_line_(false) {
70}
71
72// static empty things for use as const references.
73// To avoid global constructors, initialized on demand.
74GLOBAL_MUTEX(empty_mutex);
75static const string *empty_string;
76static const map<string, int> *empty_named_groups;
77static const map<int, string> *empty_group_names;
78
79static void InitEmpty() {
80  GLOBAL_MUTEX_LOCK(empty_mutex);
81  if (empty_string == NULL) {
82    empty_string = new string;
83    empty_named_groups = new map<string, int>;
84    empty_group_names = new map<int, string>;
85  }
86  GLOBAL_MUTEX_UNLOCK(empty_mutex);
87}
88
89// Converts from Regexp error code to RE2 error code.
90// Maybe some day they will diverge.  In any event, this
91// hides the existence of Regexp from RE2 users.
92static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
93  switch (code) {
94    case re2::kRegexpSuccess:
95      return RE2::NoError;
96    case re2::kRegexpInternalError:
97      return RE2::ErrorInternal;
98    case re2::kRegexpBadEscape:
99      return RE2::ErrorBadEscape;
100    case re2::kRegexpBadCharClass:
101      return RE2::ErrorBadCharClass;
102    case re2::kRegexpBadCharRange:
103      return RE2::ErrorBadCharRange;
104    case re2::kRegexpMissingBracket:
105      return RE2::ErrorMissingBracket;
106    case re2::kRegexpMissingParen:
107      return RE2::ErrorMissingParen;
108    case re2::kRegexpTrailingBackslash:
109      return RE2::ErrorTrailingBackslash;
110    case re2::kRegexpRepeatArgument:
111      return RE2::ErrorRepeatArgument;
112    case re2::kRegexpRepeatSize:
113      return RE2::ErrorRepeatSize;
114    case re2::kRegexpRepeatOp:
115      return RE2::ErrorRepeatOp;
116    case re2::kRegexpBadPerlOp:
117      return RE2::ErrorBadPerlOp;
118    case re2::kRegexpBadUTF8:
119      return RE2::ErrorBadUTF8;
120    case re2::kRegexpBadNamedCapture:
121      return RE2::ErrorBadNamedCapture;
122  }
123  return RE2::ErrorInternal;
124}
125
126static string trunc(const StringPiece& pattern) {
127  if (pattern.size() < 100)
128    return pattern.as_string();
129  return pattern.substr(0, 100).as_string() + "...";
130}
131
132
133RE2::RE2(const char* pattern) {
134  Init(pattern, DefaultOptions);
135}
136
137RE2::RE2(const string& pattern) {
138  Init(pattern, DefaultOptions);
139}
140
141RE2::RE2(const StringPiece& pattern) {
142  Init(pattern, DefaultOptions);
143}
144
145RE2::RE2(const StringPiece& pattern, const Options& options) {
146  Init(pattern, options);
147}
148
149int RE2::Options::ParseFlags() const {
150  int flags = Regexp::ClassNL;
151  switch (encoding()) {
152    default:
153      if (log_errors())
154        LOG(ERROR) << "Unknown encoding " << encoding();
155      break;
156    case RE2::Options::EncodingUTF8:
157      break;
158    case RE2::Options::EncodingLatin1:
159      flags |= Regexp::Latin1;
160      break;
161  }
162
163  if (!posix_syntax())
164    flags |= Regexp::LikePerl;
165
166  if (literal())
167    flags |= Regexp::Literal;
168
169  if (never_nl())
170    flags |= Regexp::NeverNL;
171
172  if (never_capture())
173    flags |= Regexp::NeverCapture;
174
175  if (!case_sensitive())
176    flags |= Regexp::FoldCase;
177
178  if (perl_classes())
179    flags |= Regexp::PerlClasses;
180
181  if (word_boundary())
182    flags |= Regexp::PerlB;
183
184  if (one_line())
185    flags |= Regexp::OneLine;
186
187  return flags;
188}
189
190void RE2::Init(const StringPiece& pattern, const Options& options) {
191  mutex_ = new Mutex;
192  pattern_ = pattern.as_string();
193  options_.Copy(options);
194  InitEmpty();
195  error_ = empty_string;
196  error_code_ = NoError;
197  suffix_regexp_ = NULL;
198  entire_regexp_ = NULL;
199  prog_ = NULL;
200  rprog_ = NULL;
201  named_groups_ = NULL;
202  group_names_ = NULL;
203  num_captures_ = -1;
204
205  RegexpStatus status;
206  entire_regexp_ = Regexp::Parse(
207    pattern_,
208    static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
209    &status);
210  if (entire_regexp_ == NULL) {
211    if (error_ == empty_string)
212      error_ = new string(status.Text());
213    if (options_.log_errors()) {
214      LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
215                 << status.Text();
216    }
217    error_arg_ = status.error_arg().as_string();
218    error_code_ = RegexpErrorToRE2(status.code());
219    return;
220  }
221
222  prefix_.clear();
223  prefix_foldcase_ = false;
224  re2::Regexp* suffix;
225  if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
226    suffix_regexp_ = suffix;
227  else
228    suffix_regexp_ = entire_regexp_->Incref();
229
230  // Two thirds of the memory goes to the forward Prog,
231  // one third to the reverse prog, because the forward
232  // Prog has two DFAs but the reverse prog has one.
233  prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
234  if (prog_ == NULL) {
235    if (options_.log_errors())
236      LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
237    error_ = new string("pattern too large - compile failed");
238    error_code_ = RE2::ErrorPatternTooLarge;
239    return;
240  }
241
242  // Could delay this until the first match call that
243  // cares about submatch information, but the one-pass
244  // machine's memory gets cut from the DFA memory budget,
245  // and that is harder to do if the DFA has already
246  // been built.
247  is_one_pass_ = prog_->IsOnePass();
248}
249
250// Returns rprog_, computing it if needed.
251re2::Prog* RE2::ReverseProg() const {
252  MutexLock l(mutex_);
253  if (rprog_ == NULL && error_ == empty_string) {
254    rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
255    if (rprog_ == NULL) {
256      if (options_.log_errors())
257        LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
258      error_ = new string("pattern too large - reverse compile failed");
259      error_code_ = RE2::ErrorPatternTooLarge;
260      return NULL;
261    }
262  }
263  return rprog_;
264}
265
266RE2::~RE2() {
267  if (suffix_regexp_)
268    suffix_regexp_->Decref();
269  if (entire_regexp_)
270    entire_regexp_->Decref();
271  delete mutex_;
272  delete prog_;
273  delete rprog_;
274  if (error_ != empty_string)
275    delete error_;
276  if (named_groups_ != NULL && named_groups_ != empty_named_groups)
277    delete named_groups_;
278  if (group_names_ != NULL &&  group_names_ != empty_group_names)
279    delete group_names_;
280}
281
282int RE2::ProgramSize() const {
283  if (prog_ == NULL)
284    return -1;
285  return prog_->size();
286}
287
288// Returns named_groups_, computing it if needed.
289const map<string, int>&  RE2::NamedCapturingGroups() const {
290  MutexLock l(mutex_);
291  if (!ok())
292    return *empty_named_groups;
293  if (named_groups_ == NULL) {
294    named_groups_ = suffix_regexp_->NamedCaptures();
295    if (named_groups_ == NULL)
296      named_groups_ = empty_named_groups;
297  }
298  return *named_groups_;
299}
300
301// Returns group_names_, computing it if needed.
302const map<int, string>&  RE2::CapturingGroupNames() const {
303  MutexLock l(mutex_);
304  if (!ok())
305    return *empty_group_names;
306  if (group_names_ == NULL) {
307    group_names_ = suffix_regexp_->CaptureNames();
308    if (group_names_ == NULL)
309      group_names_ = empty_group_names;
310  }
311  return *group_names_;
312}
313
314/***** Convenience interfaces *****/
315
316bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
317                     const Arg* const args[], int n) {
318  return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
319}
320
321bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
322                        const Arg* const args[], int n) {
323  return re.DoMatch(text, UNANCHORED, NULL, args, n);
324}
325
326bool RE2::ConsumeN(StringPiece* input, const RE2& re,
327                   const Arg* const args[], int n) {
328  int consumed;
329  if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
330    input->remove_prefix(consumed);
331    return true;
332  } else {
333    return false;
334  }
335}
336
337bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
338                          const Arg* const args[], int n) {
339  int consumed;
340  if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
341    input->remove_prefix(consumed);
342    return true;
343  } else {
344    return false;
345  }
346}
347
348// Returns the maximum submatch needed for the rewrite to be done by Replace().
349// E.g. if rewrite == "foo \\2,\\1", returns 2.
350int RE2::MaxSubmatch(const StringPiece& rewrite) {
351  int max = 0;
352  for (const char *s = rewrite.data(), *end = s + rewrite.size();
353       s < end; s++) {
354    if (*s == '\\') {
355      s++;
356      int c = (s < end) ? *s : -1;
357      if (isdigit(c)) {
358        int n = (c - '0');
359        if (n > max)
360          max = n;
361      }
362    }
363  }
364  return max;
365}
366
367bool RE2::Replace(string *str,
368                 const RE2& re,
369                 const StringPiece& rewrite) {
370  StringPiece vec[kVecSize];
371  int nvec = 1 + MaxSubmatch(rewrite);
372  if (nvec > arraysize(vec))
373    return false;
374  if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
375    return false;
376
377  string s;
378  if (!re.Rewrite(&s, rewrite, vec, nvec))
379    return false;
380
381  assert(vec[0].begin() >= str->data());
382  assert(vec[0].end() <= str->data()+str->size());
383  str->replace(vec[0].data() - str->data(), vec[0].size(), s);
384  return true;
385}
386
387int RE2::GlobalReplace(string *str,
388                      const RE2& re,
389                      const StringPiece& rewrite) {
390  StringPiece vec[kVecSize];
391  int nvec = 1 + MaxSubmatch(rewrite);
392  if (nvec > arraysize(vec))
393    return false;
394
395  const char* p = str->data();
396  const char* ep = p + str->size();
397  const char* lastend = NULL;
398  string out;
399  int count = 0;
400  while (p <= ep) {
401    if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))
402      break;
403    if (p < vec[0].begin())
404      out.append(p, vec[0].begin() - p);
405    if (vec[0].begin() == lastend && vec[0].size() == 0) {
406      // Disallow empty match at end of last match: skip ahead.
407      if (p < ep)
408        out.append(p, 1);
409      p++;
410      continue;
411    }
412    re.Rewrite(&out, rewrite, vec, nvec);
413    p = vec[0].end();
414    lastend = p;
415    count++;
416  }
417
418  if (count == 0)
419    return 0;
420
421  if (p < ep)
422    out.append(p, ep - p);
423  swap(out, *str);
424  return count;
425}
426
427bool RE2::Extract(const StringPiece &text,
428                 const RE2& re,
429                 const StringPiece &rewrite,
430                 string *out) {
431  StringPiece vec[kVecSize];
432  int nvec = 1 + MaxSubmatch(rewrite);
433  if (nvec > arraysize(vec))
434    return false;
435
436  if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
437    return false;
438
439  out->clear();
440  return re.Rewrite(out, rewrite, vec, nvec);
441}
442
443string RE2::QuoteMeta(const StringPiece& unquoted) {
444  string result;
445  result.reserve(unquoted.size() << 1);
446
447  // Escape any ascii character not in [A-Za-z_0-9].
448  //
449  // Note that it's legal to escape a character even if it has no
450  // special meaning in a regular expression -- so this function does
451  // that.  (This also makes it identical to the perl function of the
452  // same name except for the null-character special case;
453  // see `perldoc -f quotemeta`.)
454  for (int ii = 0; ii < unquoted.length(); ++ii) {
455    // Note that using 'isalnum' here raises the benchmark time from
456    // 32ns to 58ns:
457    if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
458        (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
459        (unquoted[ii] < '0' || unquoted[ii] > '9') &&
460        unquoted[ii] != '_' &&
461        // If this is the part of a UTF8 or Latin1 character, we need
462        // to copy this byte without escaping.  Experimentally this is
463        // what works correctly with the regexp library.
464        !(unquoted[ii] & 128)) {
465      if (unquoted[ii] == '\0') {  // Special handling for null chars.
466        // Note that this special handling is not strictly required for RE2,
467        // but this quoting is required for other regexp libraries such as
468        // PCRE.
469        // Can't use "\\0" since the next character might be a digit.
470        result += "\\x00";
471        continue;
472      }
473      result += '\\';
474    }
475    result += unquoted[ii];
476  }
477
478  return result;
479}
480
481bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
482  if (prog_ == NULL)
483    return false;
484
485  int n = prefix_.size();
486  if (n > maxlen)
487    n = maxlen;
488
489  // Determine initial min max from prefix_ literal.
490  string pmin, pmax;
491  pmin = prefix_.substr(0, n);
492  pmax = prefix_.substr(0, n);
493  if (prefix_foldcase_) {
494    // prefix is ASCII lowercase; change pmin to uppercase.
495    for (int i = 0; i < n; i++) {
496      if ('a' <= pmin[i] && pmin[i] <= 'z')
497        pmin[i] += 'A' - 'a';
498    }
499  }
500
501  // Add to prefix min max using PossibleMatchRange on regexp.
502  string dmin, dmax;
503  maxlen -= n;
504  if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
505    pmin += dmin;
506    pmax += dmax;
507  } else if (pmax.size() > 0) {
508    // prog_->PossibleMatchRange has failed us,
509    // but we still have useful information from prefix_.
510    // Round up pmax to allow any possible suffix.
511    pmax = PrefixSuccessor(pmax);
512  } else {
513    // Nothing useful.
514    *min = "";
515    *max = "";
516    return false;
517  }
518
519  *min = pmin;
520  *max = pmax;
521  return true;
522}
523
524// Avoid possible locale nonsense in standard strcasecmp.
525// The string a is known to be all lowercase.
526static int ascii_strcasecmp(const char* a, const char* b, int len) {
527  const char *ae = a + len;
528
529  for (; a < ae; a++, b++) {
530    uint8 x = *a;
531    uint8 y = *b;
532    if ('A' <= y && y <= 'Z')
533      y += 'a' - 'A';
534    if (x != y)
535      return x - y;
536  }
537  return 0;
538}
539
540
541/***** Actual matching and rewriting code *****/
542
543bool RE2::Match(const StringPiece& text,
544                int startpos,
545                int endpos,
546                Anchor re_anchor,
547                StringPiece* submatch,
548                int nsubmatch) const {
549  if (!ok() || suffix_regexp_ == NULL) {
550    if (options_.log_errors())
551      LOG(ERROR) << "Invalid RE2: " << *error_;
552    return false;
553  }
554
555  if (startpos < 0 || startpos > endpos || endpos > text.size()) {
556    if (options_.log_errors())
557      LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
558    return false;
559  }
560
561  StringPiece subtext = text;
562  subtext.remove_prefix(startpos);
563  subtext.remove_suffix(text.size() - endpos);
564
565  // Use DFAs to find exact location of match, filter out non-matches.
566
567  // Don't ask for the location if we won't use it.
568  // SearchDFA can do extra optimizations in that case.
569  StringPiece match;
570  StringPiece* matchp = &match;
571  if (nsubmatch == 0)
572    matchp = NULL;
573
574  int ncap = 1 + NumberOfCapturingGroups();
575  if (ncap > nsubmatch)
576    ncap = nsubmatch;
577
578  // If the regexp is anchored explicitly, must not be in middle of text.
579  if (prog_->anchor_start() && startpos != 0)
580    return false;
581
582  // If the regexp is anchored explicitly, update re_anchor
583  // so that we can potentially fall into a faster case below.
584  if (prog_->anchor_start() && prog_->anchor_end())
585    re_anchor = ANCHOR_BOTH;
586  else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
587    re_anchor = ANCHOR_START;
588
589  // Check for the required prefix, if any.
590  int prefixlen = 0;
591  if (!prefix_.empty()) {
592    if (startpos != 0)
593      return false;
594    prefixlen = prefix_.size();
595    if (prefixlen > subtext.size())
596      return false;
597    if (prefix_foldcase_) {
598      if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
599        return false;
600    } else {
601      if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
602        return false;
603    }
604    subtext.remove_prefix(prefixlen);
605    // If there is a required prefix, the anchor must be at least ANCHOR_START.
606    if (re_anchor != ANCHOR_BOTH)
607      re_anchor = ANCHOR_START;
608  }
609
610  Prog::Anchor anchor = Prog::kUnanchored;
611  Prog::MatchKind kind = Prog::kFirstMatch;
612  if (options_.longest_match())
613    kind = Prog::kLongestMatch;
614  bool skipped_test = false;
615
616  bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
617
618  // SearchBitState allocates a bit vector of size prog_->size() * text.size().
619  // It also allocates a stack of 3-word structures which could potentially
620  // grow as large as prog_->size() * text.size() but in practice is much
621  // smaller.
622  // Conditions for using SearchBitState:
623  const int MaxBitStateProg = 500;   // prog_->size() <= Max.
624  const int MaxBitStateVector = 256*1024;  // bit vector size <= Max (bits)
625  bool can_bit_state = prog_->size() <= MaxBitStateProg;
626  int bit_state_text_max = MaxBitStateVector / prog_->size();
627
628  bool dfa_failed = false;
629  switch (re_anchor) {
630    default:
631    case UNANCHORED: {
632      if (!prog_->SearchDFA(subtext, text, anchor, kind,
633                            matchp, &dfa_failed, NULL)) {
634        if (dfa_failed) {
635          // Fall back to NFA below.
636          skipped_test = true;
637          if (FLAGS_trace_re2)
638            LOG(INFO) << "Match " << trunc(pattern_)
639                      << " [" << CEscape(subtext) << "]"
640                      << " DFA failed.";
641          break;
642        }
643        if (FLAGS_trace_re2)
644          LOG(INFO) << "Match " << trunc(pattern_)
645                    << " [" << CEscape(subtext) << "]"
646                    << " used DFA - no match.";
647        return false;
648      }
649      if (FLAGS_trace_re2)
650        LOG(INFO) << "Match " << trunc(pattern_)
651                  << " [" << CEscape(subtext) << "]"
652                  << " used DFA - match";
653      if (matchp == NULL)  // Matched.  Don't care where
654        return true;
655      // SearchDFA set match[0].end() but didn't know where the
656      // match started.  Run the regexp backward from match[0].end()
657      // to find the longest possible match -- that's where it started.
658      Prog* prog = ReverseProg();
659      if (prog == NULL)
660        return false;
661      if (!prog->SearchDFA(match, text, Prog::kAnchored,
662                           Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
663        if (dfa_failed) {
664          // Fall back to NFA below.
665          skipped_test = true;
666          if (FLAGS_trace_re2)
667            LOG(INFO) << "Match " << trunc(pattern_)
668                      << " [" << CEscape(subtext) << "]"
669                      << " reverse DFA failed.";
670          break;
671        }
672        if (FLAGS_trace_re2)
673          LOG(INFO) << "Match " << trunc(pattern_)
674                    << " [" << CEscape(subtext) << "]"
675                    << " DFA inconsistency.";
676        if (options_.log_errors())
677          LOG(ERROR) << "DFA inconsistency";
678        return false;
679      }
680      if (FLAGS_trace_re2)
681        LOG(INFO) << "Match " << trunc(pattern_)
682                  << " [" << CEscape(subtext) << "]"
683                  << " used reverse DFA.";
684      break;
685    }
686
687    case ANCHOR_BOTH:
688    case ANCHOR_START:
689      if (re_anchor == ANCHOR_BOTH)
690        kind = Prog::kFullMatch;
691      anchor = Prog::kAnchored;
692
693      // If only a small amount of text and need submatch
694      // information anyway and we're going to use OnePass or BitState
695      // to get it, we might as well not even bother with the DFA:
696      // OnePass or BitState will be fast enough.
697      // On tiny texts, OnePass outruns even the DFA, and
698      // it doesn't have the shared state and occasional mutex that
699      // the DFA does.
700      if (can_one_pass && text.size() <= 4096 &&
701          (ncap > 1 || text.size() <= 8)) {
702        if (FLAGS_trace_re2)
703          LOG(INFO) << "Match " << trunc(pattern_)
704                    << " [" << CEscape(subtext) << "]"
705                    << " skipping DFA for OnePass.";
706        skipped_test = true;
707        break;
708      }
709      if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
710        if (FLAGS_trace_re2)
711          LOG(INFO) << "Match " << trunc(pattern_)
712                    << " [" << CEscape(subtext) << "]"
713                    << " skipping DFA for BitState.";
714        skipped_test = true;
715        break;
716      }
717      if (!prog_->SearchDFA(subtext, text, anchor, kind,
718                            &match, &dfa_failed, NULL)) {
719        if (dfa_failed) {
720          if (FLAGS_trace_re2)
721            LOG(INFO) << "Match " << trunc(pattern_)
722                      << " [" << CEscape(subtext) << "]"
723                      << " DFA failed.";
724          skipped_test = true;
725          break;
726        }
727        if (FLAGS_trace_re2)
728          LOG(INFO) << "Match " << trunc(pattern_)
729                    << " [" << CEscape(subtext) << "]"
730                    << " used DFA - no match.";
731        return false;
732      }
733      break;
734  }
735
736  if (!skipped_test && ncap <= 1) {
737    // We know exactly where it matches.  That's enough.
738    if (ncap == 1)
739      submatch[0] = match;
740  } else {
741    StringPiece subtext1;
742    if (skipped_test) {
743      // DFA ran out of memory or was skipped:
744      // need to search in entire original text.
745      subtext1 = subtext;
746    } else {
747      // DFA found the exact match location:
748      // let NFA run an anchored, full match search
749      // to find submatch locations.
750      subtext1 = match;
751      anchor = Prog::kAnchored;
752      kind = Prog::kFullMatch;
753    }
754
755    if (can_one_pass && anchor != Prog::kUnanchored) {
756      if (FLAGS_trace_re2)
757        LOG(INFO) << "Match " << trunc(pattern_)
758                  << " [" << CEscape(subtext) << "]"
759                  << " using OnePass.";
760      if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
761        if (!skipped_test && options_.log_errors())
762          LOG(ERROR) << "SearchOnePass inconsistency";
763        return false;
764      }
765    } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
766      if (FLAGS_trace_re2)
767        LOG(INFO) << "Match " << trunc(pattern_)
768                  << " [" << CEscape(subtext) << "]"
769                  << " using BitState.";
770      if (!prog_->SearchBitState(subtext1, text, anchor,
771                                 kind, submatch, ncap)) {
772        if (!skipped_test && options_.log_errors())
773          LOG(ERROR) << "SearchBitState inconsistency";
774        return false;
775      }
776    } else {
777      if (FLAGS_trace_re2)
778        LOG(INFO) << "Match " << trunc(pattern_)
779                  << " [" << CEscape(subtext) << "]"
780                  << " using NFA.";
781      if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
782        if (!skipped_test && options_.log_errors())
783          LOG(ERROR) << "SearchNFA inconsistency";
784        return false;
785      }
786    }
787  }
788
789  // Adjust overall match for required prefix that we stripped off.
790  if (prefixlen > 0 && nsubmatch > 0)
791    submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
792                              submatch[0].size() + prefixlen);
793
794  // Zero submatches that don't exist in the regexp.
795  for (int i = ncap; i < nsubmatch; i++)
796    submatch[i] = NULL;
797  return true;
798}
799
800// Internal matcher - like Match() but takes Args not StringPieces.
801bool RE2::DoMatch(const StringPiece& text,
802                  Anchor anchor,
803                  int* consumed,
804                  const Arg* const* args,
805                  int n) const {
806  if (!ok()) {
807    if (options_.log_errors())
808      LOG(ERROR) << "Invalid RE2: " << *error_;
809    return false;
810  }
811
812  // Count number of capture groups needed.
813  int nvec;
814  if (n == 0 && consumed == NULL)
815    nvec = 0;
816  else
817    nvec = n+1;
818
819  StringPiece* vec;
820  StringPiece stkvec[kVecSize];
821  StringPiece* heapvec = NULL;
822
823  if (nvec <= arraysize(stkvec)) {
824    vec = stkvec;
825  } else {
826    vec = new StringPiece[nvec];
827    heapvec = vec;
828  }
829
830  if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
831    delete[] heapvec;
832    return false;
833  }
834
835  if(consumed != NULL)
836    *consumed = vec[0].end() - text.begin();
837
838  if (n == 0 || args == NULL) {
839    // We are not interested in results
840    delete[] heapvec;
841    return true;
842  }
843
844  int ncap = NumberOfCapturingGroups();
845  if (ncap < n) {
846    // RE has fewer capturing groups than number of arg pointers passed in
847    VLOG(1) << "Asked for " << n << " but only have " << ncap;
848    delete[] heapvec;
849    return false;
850  }
851
852  // If we got here, we must have matched the whole pattern.
853  for (int i = 0; i < n; i++) {
854    const StringPiece& s = vec[i+1];
855    if (!args[i]->Parse(s.data(), s.size())) {
856      // TODO: Should we indicate what the error was?
857      VLOG(1) << "Parse error on #" << i << " " << s << " "
858	      << (void*)s.data() << "/" << s.size();
859      delete[] heapvec;
860      return false;
861    }
862  }
863
864  delete[] heapvec;
865  return true;
866}
867
868// Append the "rewrite" string, with backslash subsitutions from "vec",
869// to string "out".
870bool RE2::Rewrite(string *out, const StringPiece &rewrite,
871                 const StringPiece *vec, int veclen) const {
872  for (const char *s = rewrite.data(), *end = s + rewrite.size();
873       s < end; s++) {
874    int c = *s;
875    if (c == '\\') {
876      s++;
877      c = (s < end) ? *s : -1;
878      if (isdigit(c)) {
879        int n = (c - '0');
880        if (n >= veclen) {
881          if (options_.log_errors()) {
882            LOG(ERROR) << "requested group " << n
883                       << " in regexp " << rewrite.data();
884          }
885          return false;
886        }
887        StringPiece snip = vec[n];
888        if (snip.size() > 0)
889          out->append(snip.data(), snip.size());
890      } else if (c == '\\') {
891        out->push_back('\\');
892      } else {
893        if (options_.log_errors())
894          LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
895        return false;
896      }
897    } else {
898      out->push_back(c);
899    }
900  }
901  return true;
902}
903
904// Return the number of capturing subpatterns, or -1 if the
905// regexp wasn't valid on construction.
906int RE2::NumberOfCapturingGroups() const {
907  if (suffix_regexp_ == NULL)
908    return -1;
909  ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
910    " multiple threads end up doing the same work in parallel.");
911  if (num_captures_ == -1)
912    num_captures_ = suffix_regexp_->NumCaptures();
913  return num_captures_;
914}
915
916// Checks that the rewrite string is well-formed with respect to this
917// regular expression.
918bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
919  int max_token = -1;
920  for (const char *s = rewrite.data(), *end = s + rewrite.size();
921       s < end; s++) {
922    int c = *s;
923    if (c != '\\') {
924      continue;
925    }
926    if (++s == end) {
927      *error = "Rewrite schema error: '\\' not allowed at end.";
928      return false;
929    }
930    c = *s;
931    if (c == '\\') {
932      continue;
933    }
934    if (!isdigit(c)) {
935      *error = "Rewrite schema error: "
936               "'\\' must be followed by a digit or '\\'.";
937      return false;
938    }
939    int n = (c - '0');
940    if (max_token < n) {
941      max_token = n;
942    }
943  }
944
945  if (max_token > NumberOfCapturingGroups()) {
946    SStringPrintf(error, "Rewrite schema requests %d matches, "
947                  "but the regexp only has %d parenthesized subexpressions.",
948                  max_token, NumberOfCapturingGroups());
949    return false;
950  }
951  return true;
952}
953
954/***** Parsers for various types *****/
955
956bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
957  // We fail if somebody asked us to store into a non-NULL void* pointer
958  return (dest == NULL);
959}
960
961bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
962  if (dest == NULL) return true;
963  reinterpret_cast<string*>(dest)->assign(str, n);
964  return true;
965}
966
967bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
968  if (dest == NULL) return true;
969  reinterpret_cast<StringPiece*>(dest)->set(str, n);
970  return true;
971}
972
973bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
974  if (n != 1) return false;
975  if (dest == NULL) return true;
976  *(reinterpret_cast<char*>(dest)) = str[0];
977  return true;
978}
979
980bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
981  if (n != 1) return false;
982  if (dest == NULL) return true;
983  *(reinterpret_cast<unsigned char*>(dest)) = str[0];
984  return true;
985}
986
987// Largest number spec that we are willing to parse
988static const int kMaxNumberLength = 32;
989
990// REQUIRES "buf" must have length at least kMaxNumberLength+1
991// Copies "str" into "buf" and null-terminates.
992// Overwrites *np with the new length.
993static const char* TerminateNumber(char* buf, const char* str, int* np) {
994  int n = *np;
995  if (n <= 0) return "";
996  if (n > 0 && isspace(*str)) {
997    // We are less forgiving than the strtoxxx() routines and do not
998    // allow leading spaces.
999    return "";
1000  }
1001
1002  // Although buf has a fixed maximum size, we can still handle
1003  // arbitrarily large integers correctly by omitting leading zeros.
1004  // (Numbers that are still too long will be out of range.)
1005  // Before deciding whether str is too long,
1006  // remove leading zeros with s/000+/00/.
1007  // Leaving the leading two zeros in place means that
1008  // we don't change 0000x123 (invalid) into 0x123 (valid).
1009  // Skip over leading - before replacing.
1010  bool neg = false;
1011  if (n >= 1 && str[0] == '-') {
1012    neg = true;
1013    n--;
1014    str++;
1015  }
1016
1017  if (n >= 3 && str[0] == '0' && str[1] == '0') {
1018    while (n >= 3 && str[2] == '0') {
1019      n--;
1020      str++;
1021    }
1022  }
1023
1024  if (neg) {  // make room in buf for -
1025    n++;
1026    str--;
1027  }
1028
1029  if (n > kMaxNumberLength) return "";
1030
1031  memmove(buf, str, n);
1032  if (neg) {
1033    buf[0] = '-';
1034  }
1035  buf[n] = '\0';
1036  *np = n;
1037  return buf;
1038}
1039
1040bool RE2::Arg::parse_long_radix(const char* str,
1041                               int n,
1042                               void* dest,
1043                               int radix) {
1044  if (n == 0) return false;
1045  char buf[kMaxNumberLength+1];
1046  str = TerminateNumber(buf, str, &n);
1047  char* end;
1048  errno = 0;
1049  long r = strtol(str, &end, radix);
1050  if (end != str + n) return false;   // Leftover junk
1051  if (errno) return false;
1052  if (dest == NULL) return true;
1053  *(reinterpret_cast<long*>(dest)) = r;
1054  return true;
1055}
1056
1057bool RE2::Arg::parse_ulong_radix(const char* str,
1058                                int n,
1059                                void* dest,
1060                                int radix) {
1061  if (n == 0) return false;
1062  char buf[kMaxNumberLength+1];
1063  str = TerminateNumber(buf, str, &n);
1064  if (str[0] == '-') {
1065   // strtoul() will silently accept negative numbers and parse
1066   // them.  This module is more strict and treats them as errors.
1067   return false;
1068  }
1069
1070  char* end;
1071  errno = 0;
1072  unsigned long r = strtoul(str, &end, radix);
1073  if (end != str + n) return false;   // Leftover junk
1074  if (errno) return false;
1075  if (dest == NULL) return true;
1076  *(reinterpret_cast<unsigned long*>(dest)) = r;
1077  return true;
1078}
1079
1080bool RE2::Arg::parse_short_radix(const char* str,
1081                                int n,
1082                                void* dest,
1083                                int radix) {
1084  long r;
1085  if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1086  if ((short)r != r) return false;       // Out of range
1087  if (dest == NULL) return true;
1088  *(reinterpret_cast<short*>(dest)) = r;
1089  return true;
1090}
1091
1092bool RE2::Arg::parse_ushort_radix(const char* str,
1093                                 int n,
1094                                 void* dest,
1095                                 int radix) {
1096  unsigned long r;
1097  if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1098  if ((ushort)r != r) return false;                      // Out of range
1099  if (dest == NULL) return true;
1100  *(reinterpret_cast<unsigned short*>(dest)) = r;
1101  return true;
1102}
1103
1104bool RE2::Arg::parse_int_radix(const char* str,
1105                              int n,
1106                              void* dest,
1107                              int radix) {
1108  long r;
1109  if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1110  if ((int)r != r) return false;         // Out of range
1111  if (dest == NULL) return true;
1112  *(reinterpret_cast<int*>(dest)) = r;
1113  return true;
1114}
1115
1116bool RE2::Arg::parse_uint_radix(const char* str,
1117                               int n,
1118                               void* dest,
1119                               int radix) {
1120  unsigned long r;
1121  if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1122  if ((uint)r != r) return false;                       // Out of range
1123  if (dest == NULL) return true;
1124  *(reinterpret_cast<unsigned int*>(dest)) = r;
1125  return true;
1126}
1127
1128bool RE2::Arg::parse_longlong_radix(const char* str,
1129                                   int n,
1130                                   void* dest,
1131                                   int radix) {
1132  if (n == 0) return false;
1133  char buf[kMaxNumberLength+1];
1134  str = TerminateNumber(buf, str, &n);
1135  char* end;
1136  errno = 0;
1137  int64 r = strtoll(str, &end, radix);
1138  if (end != str + n) return false;   // Leftover junk
1139  if (errno) return false;
1140  if (dest == NULL) return true;
1141  *(reinterpret_cast<int64*>(dest)) = r;
1142  return true;
1143}
1144
1145bool RE2::Arg::parse_ulonglong_radix(const char* str,
1146                                    int n,
1147                                    void* dest,
1148                                    int radix) {
1149  if (n == 0) return false;
1150  char buf[kMaxNumberLength+1];
1151  str = TerminateNumber(buf, str, &n);
1152  if (str[0] == '-') {
1153    // strtoull() will silently accept negative numbers and parse
1154    // them.  This module is more strict and treats them as errors.
1155    return false;
1156  }
1157  char* end;
1158  errno = 0;
1159  uint64 r = strtoull(str, &end, radix);
1160  if (end != str + n) return false;   // Leftover junk
1161  if (errno) return false;
1162  if (dest == NULL) return true;
1163  *(reinterpret_cast<uint64*>(dest)) = r;
1164  return true;
1165}
1166
1167static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
1168  if (n == 0) return false;
1169  static const int kMaxLength = 200;
1170  char buf[kMaxLength];
1171  if (n >= kMaxLength) return false;
1172  memcpy(buf, str, n);
1173  buf[n] = '\0';
1174  errno = 0;
1175  char* end;
1176  double r;
1177  if (isfloat) {
1178    r = strtof(buf, &end);
1179  } else {
1180    r = strtod(buf, &end);
1181  }
1182  if (end != buf + n) return false;   // Leftover junk
1183  if (errno) return false;
1184  if (dest == NULL) return true;
1185  if (isfloat) {
1186    *(reinterpret_cast<float*>(dest)) = r;
1187  } else {
1188    *(reinterpret_cast<double*>(dest)) = r;
1189  }
1190  return true;
1191}
1192
1193bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
1194  return parse_double_float(str, n, false, dest);
1195}
1196
1197bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
1198  return parse_double_float(str, n, true, dest);
1199}
1200
1201
1202#define DEFINE_INTEGER_PARSERS(name)                                        \
1203  bool RE2::Arg::parse_##name(const char* str, int n, void* dest) {          \
1204    return parse_##name##_radix(str, n, dest, 10);                          \
1205  }                                                                         \
1206  bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
1207    return parse_##name##_radix(str, n, dest, 16);                          \
1208  }                                                                         \
1209  bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
1210    return parse_##name##_radix(str, n, dest, 8);                           \
1211  }                                                                         \
1212  bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
1213    return parse_##name##_radix(str, n, dest, 0);                           \
1214  }
1215
1216DEFINE_INTEGER_PARSERS(short);
1217DEFINE_INTEGER_PARSERS(ushort);
1218DEFINE_INTEGER_PARSERS(int);
1219DEFINE_INTEGER_PARSERS(uint);
1220DEFINE_INTEGER_PARSERS(long);
1221DEFINE_INTEGER_PARSERS(ulong);
1222DEFINE_INTEGER_PARSERS(longlong);
1223DEFINE_INTEGER_PARSERS(ulonglong);
1224
1225#undef DEFINE_INTEGER_PARSERS
1226
1227}  // namespace re2
1228