1// Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Regular expression interface RE2.
6//
7// Originally the PCRE C++ wrapper, but adapted to use
8// the new automata-based regular expression engines.
9
10#include "re2/re2.h"
11
12#include <stdio.h>
13#include <string>
14#include <pthread.h>
15#include <errno.h>
16#include "util/util.h"
17#include "util/flags.h"
18#include "re2/prog.h"
19#include "re2/regexp.h"
20
21DEFINE_bool(trace_re2, false, "trace RE2 execution");
22
23namespace re2 {
24
25// Maximum number of args we can set
26static const int kMaxArgs = 16;
27static const int kVecSize = 1+kMaxArgs;
28
29const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch;
30const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch;
31const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume;
32const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume;
33
34// This will trigger LNK2005 error in MSVC.
35#ifndef COMPILER_MSVC
36const int RE2::Options::kDefaultMaxMem;  // initialized in re2.h
37#endif  // COMPILER_MSVC
38
39RE2::Options::Options(RE2::CannedOptions opt)
40  : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
41    posix_syntax_(opt == RE2::POSIX),
42    longest_match_(opt == RE2::POSIX),
43    log_errors_(opt != RE2::Quiet),
44    max_mem_(kDefaultMaxMem),
45    literal_(false),
46    never_nl_(false),
47    never_capture_(false),
48    case_sensitive_(true),
49    perl_classes_(false),
50    word_boundary_(false),
51    one_line_(false) {
52}
53
54// static empty things for use as const references.
55// To avoid global constructors, initialized on demand.
56GLOBAL_MUTEX(empty_mutex);
57static const string *empty_string;
58static const map<string, int> *empty_named_groups;
59static const map<int, string> *empty_group_names;
60
61static void InitEmpty() {
62  GLOBAL_MUTEX_LOCK(empty_mutex);
63  if (empty_string == NULL) {
64    empty_string = new string;
65    empty_named_groups = new map<string, int>;
66    empty_group_names = new map<int, string>;
67  }
68  GLOBAL_MUTEX_UNLOCK(empty_mutex);
69}
70
71// Converts from Regexp error code to RE2 error code.
72// Maybe some day they will diverge.  In any event, this
73// hides the existence of Regexp from RE2 users.
74static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
75  switch (code) {
76    case re2::kRegexpSuccess:
77      return RE2::NoError;
78    case re2::kRegexpInternalError:
79      return RE2::ErrorInternal;
80    case re2::kRegexpBadEscape:
81      return RE2::ErrorBadEscape;
82    case re2::kRegexpBadCharClass:
83      return RE2::ErrorBadCharClass;
84    case re2::kRegexpBadCharRange:
85      return RE2::ErrorBadCharRange;
86    case re2::kRegexpMissingBracket:
87      return RE2::ErrorMissingBracket;
88    case re2::kRegexpMissingParen:
89      return RE2::ErrorMissingParen;
90    case re2::kRegexpTrailingBackslash:
91      return RE2::ErrorTrailingBackslash;
92    case re2::kRegexpRepeatArgument:
93      return RE2::ErrorRepeatArgument;
94    case re2::kRegexpRepeatSize:
95      return RE2::ErrorRepeatSize;
96    case re2::kRegexpRepeatOp:
97      return RE2::ErrorRepeatOp;
98    case re2::kRegexpBadPerlOp:
99      return RE2::ErrorBadPerlOp;
100    case re2::kRegexpBadUTF8:
101      return RE2::ErrorBadUTF8;
102    case re2::kRegexpBadNamedCapture:
103      return RE2::ErrorBadNamedCapture;
104  }
105  return RE2::ErrorInternal;
106}
107
108static string trunc(const StringPiece& pattern) {
109  if (pattern.size() < 100)
110    return pattern.as_string();
111  return pattern.substr(0, 100).as_string() + "...";
112}
113
114
115RE2::RE2(const char* pattern) {
116  Init(pattern, DefaultOptions);
117}
118
119RE2::RE2(const string& pattern) {
120  Init(pattern, DefaultOptions);
121}
122
123RE2::RE2(const StringPiece& pattern) {
124  Init(pattern, DefaultOptions);
125}
126
127RE2::RE2(const StringPiece& pattern, const Options& options) {
128  Init(pattern, options);
129}
130
131int RE2::Options::ParseFlags() const {
132  int flags = Regexp::ClassNL;
133  switch (encoding()) {
134    default:
135      if (log_errors())
136        LOG(ERROR) << "Unknown encoding " << encoding();
137      break;
138    case RE2::Options::EncodingUTF8:
139      break;
140    case RE2::Options::EncodingLatin1:
141      flags |= Regexp::Latin1;
142      break;
143  }
144
145  if (!posix_syntax())
146    flags |= Regexp::LikePerl;
147
148  if (literal())
149    flags |= Regexp::Literal;
150
151  if (never_nl())
152    flags |= Regexp::NeverNL;
153
154  if (never_capture())
155    flags |= Regexp::NeverCapture;
156
157  if (!case_sensitive())
158    flags |= Regexp::FoldCase;
159
160  if (perl_classes())
161    flags |= Regexp::PerlClasses;
162
163  if (word_boundary())
164    flags |= Regexp::PerlB;
165
166  if (one_line())
167    flags |= Regexp::OneLine;
168
169  return flags;
170}
171
172void RE2::Init(const StringPiece& pattern, const Options& options) {
173  mutex_ = new Mutex;
174  pattern_ = pattern.as_string();
175  options_.Copy(options);
176  InitEmpty();
177  error_ = empty_string;
178  error_code_ = NoError;
179  suffix_regexp_ = NULL;
180  entire_regexp_ = NULL;
181  prog_ = NULL;
182  rprog_ = NULL;
183  named_groups_ = NULL;
184  group_names_ = NULL;
185  num_captures_ = -1;
186
187  RegexpStatus status;
188  entire_regexp_ = Regexp::Parse(
189    pattern_,
190    static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
191    &status);
192  if (entire_regexp_ == NULL) {
193    if (error_ == empty_string)
194      error_ = new string(status.Text());
195    if (options_.log_errors()) {
196      LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
197                 << status.Text();
198    }
199    error_arg_ = status.error_arg().as_string();
200    error_code_ = RegexpErrorToRE2(status.code());
201    return;
202  }
203
204  prefix_.clear();
205  prefix_foldcase_ = false;
206  re2::Regexp* suffix;
207  if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
208    suffix_regexp_ = suffix;
209  else
210    suffix_regexp_ = entire_regexp_->Incref();
211
212  // Two thirds of the memory goes to the forward Prog,
213  // one third to the reverse prog, because the forward
214  // Prog has two DFAs but the reverse prog has one.
215  prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
216  if (prog_ == NULL) {
217    if (options_.log_errors())
218      LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
219    error_ = new string("pattern too large - compile failed");
220    error_code_ = RE2::ErrorPatternTooLarge;
221    return;
222  }
223
224  // Could delay this until the first match call that
225  // cares about submatch information, but the one-pass
226  // machine's memory gets cut from the DFA memory budget,
227  // and that is harder to do if the DFA has already
228  // been built.
229  is_one_pass_ = prog_->IsOnePass();
230}
231
232// Returns rprog_, computing it if needed.
233re2::Prog* RE2::ReverseProg() const {
234  MutexLock l(mutex_);
235  if (rprog_ == NULL && error_ == empty_string) {
236    rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
237    if (rprog_ == NULL) {
238      if (options_.log_errors())
239        LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
240      error_ = new string("pattern too large - reverse compile failed");
241      error_code_ = RE2::ErrorPatternTooLarge;
242      return NULL;
243    }
244  }
245  return rprog_;
246}
247
248RE2::~RE2() {
249  if (suffix_regexp_)
250    suffix_regexp_->Decref();
251  if (entire_regexp_)
252    entire_regexp_->Decref();
253  delete mutex_;
254  delete prog_;
255  delete rprog_;
256  if (error_ != empty_string)
257    delete error_;
258  if (named_groups_ != NULL && named_groups_ != empty_named_groups)
259    delete named_groups_;
260  if (group_names_ != NULL &&  group_names_ != empty_group_names)
261    delete group_names_;
262}
263
264int RE2::ProgramSize() const {
265  if (prog_ == NULL)
266    return -1;
267  return prog_->size();
268}
269
270// Returns named_groups_, computing it if needed.
271const map<string, int>&  RE2::NamedCapturingGroups() const {
272  MutexLock l(mutex_);
273  if (!ok())
274    return *empty_named_groups;
275  if (named_groups_ == NULL) {
276    named_groups_ = suffix_regexp_->NamedCaptures();
277    if (named_groups_ == NULL)
278      named_groups_ = empty_named_groups;
279  }
280  return *named_groups_;
281}
282
283// Returns group_names_, computing it if needed.
284const map<int, string>&  RE2::CapturingGroupNames() const {
285  MutexLock l(mutex_);
286  if (!ok())
287    return *empty_group_names;
288  if (group_names_ == NULL) {
289    group_names_ = suffix_regexp_->CaptureNames();
290    if (group_names_ == NULL)
291      group_names_ = empty_group_names;
292  }
293  return *group_names_;
294}
295
296/***** Convenience interfaces *****/
297
298bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
299                     const Arg* const args[], int n) {
300  return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
301}
302
303bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
304                        const Arg* const args[], int n) {
305  return re.DoMatch(text, UNANCHORED, NULL, args, n);
306}
307
308bool RE2::ConsumeN(StringPiece* input, const RE2& re,
309                   const Arg* const args[], int n) {
310  int consumed;
311  if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
312    input->remove_prefix(consumed);
313    return true;
314  } else {
315    return false;
316  }
317}
318
319bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
320                          const Arg* const args[], int n) {
321  int consumed;
322  if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
323    input->remove_prefix(consumed);
324    return true;
325  } else {
326    return false;
327  }
328}
329
330// Returns the maximum submatch needed for the rewrite to be done by Replace().
331// E.g. if rewrite == "foo \\2,\\1", returns 2.
332int RE2::MaxSubmatch(const StringPiece& rewrite) {
333  int max = 0;
334  for (const char *s = rewrite.data(), *end = s + rewrite.size();
335       s < end; s++) {
336    if (*s == '\\') {
337      s++;
338      int c = (s < end) ? *s : -1;
339      if (isdigit(c)) {
340        int n = (c - '0');
341        if (n > max)
342          max = n;
343      }
344    }
345  }
346  return max;
347}
348
349bool RE2::Replace(string *str,
350                 const RE2& re,
351                 const StringPiece& rewrite) {
352  StringPiece vec[kVecSize];
353  int nvec = 1 + MaxSubmatch(rewrite);
354  if (nvec > arraysize(vec))
355    return false;
356  if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
357    return false;
358
359  string s;
360  if (!re.Rewrite(&s, rewrite, vec, nvec))
361    return false;
362
363  assert(vec[0].begin() >= str->data());
364  assert(vec[0].end() <= str->data()+str->size());
365  str->replace(vec[0].data() - str->data(), vec[0].size(), s);
366  return true;
367}
368
369int RE2::GlobalReplace(string *str,
370                      const RE2& re,
371                      const StringPiece& rewrite) {
372  StringPiece vec[kVecSize];
373  int nvec = 1 + MaxSubmatch(rewrite);
374  if (nvec > arraysize(vec))
375    return false;
376
377  const char* p = str->data();
378  const char* ep = p + str->size();
379  const char* lastend = NULL;
380  string out;
381  int count = 0;
382  while (p <= ep) {
383    if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))
384      break;
385    if (p < vec[0].begin())
386      out.append(p, vec[0].begin() - p);
387    if (vec[0].begin() == lastend && vec[0].size() == 0) {
388      // Disallow empty match at end of last match: skip ahead.
389      if (p < ep)
390        out.append(p, 1);
391      p++;
392      continue;
393    }
394    re.Rewrite(&out, rewrite, vec, nvec);
395    p = vec[0].end();
396    lastend = p;
397    count++;
398  }
399
400  if (count == 0)
401    return 0;
402
403  if (p < ep)
404    out.append(p, ep - p);
405  swap(out, *str);
406  return count;
407}
408
409bool RE2::Extract(const StringPiece &text,
410                 const RE2& re,
411                 const StringPiece &rewrite,
412                 string *out) {
413  StringPiece vec[kVecSize];
414  int nvec = 1 + MaxSubmatch(rewrite);
415  if (nvec > arraysize(vec))
416    return false;
417
418  if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
419    return false;
420
421  out->clear();
422  return re.Rewrite(out, rewrite, vec, nvec);
423}
424
425string RE2::QuoteMeta(const StringPiece& unquoted) {
426  string result;
427  result.reserve(unquoted.size() << 1);
428
429  // Escape any ascii character not in [A-Za-z_0-9].
430  //
431  // Note that it's legal to escape a character even if it has no
432  // special meaning in a regular expression -- so this function does
433  // that.  (This also makes it identical to the perl function of the
434  // same name except for the null-character special case;
435  // see `perldoc -f quotemeta`.)
436  for (int ii = 0; ii < unquoted.length(); ++ii) {
437    // Note that using 'isalnum' here raises the benchmark time from
438    // 32ns to 58ns:
439    if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
440        (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
441        (unquoted[ii] < '0' || unquoted[ii] > '9') &&
442        unquoted[ii] != '_' &&
443        // If this is the part of a UTF8 or Latin1 character, we need
444        // to copy this byte without escaping.  Experimentally this is
445        // what works correctly with the regexp library.
446        !(unquoted[ii] & 128)) {
447      if (unquoted[ii] == '\0') {  // Special handling for null chars.
448        // Note that this special handling is not strictly required for RE2,
449        // but this quoting is required for other regexp libraries such as
450        // PCRE.
451        // Can't use "\\0" since the next character might be a digit.
452        result += "\\x00";
453        continue;
454      }
455      result += '\\';
456    }
457    result += unquoted[ii];
458  }
459
460  return result;
461}
462
463bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
464  if (prog_ == NULL)
465    return false;
466
467  int n = prefix_.size();
468  if (n > maxlen)
469    n = maxlen;
470
471  // Determine initial min max from prefix_ literal.
472  string pmin, pmax;
473  pmin = prefix_.substr(0, n);
474  pmax = prefix_.substr(0, n);
475  if (prefix_foldcase_) {
476    // prefix is ASCII lowercase; change pmin to uppercase.
477    for (int i = 0; i < n; i++) {
478      if ('a' <= pmin[i] && pmin[i] <= 'z')
479        pmin[i] += 'A' - 'a';
480    }
481  }
482
483  // Add to prefix min max using PossibleMatchRange on regexp.
484  string dmin, dmax;
485  maxlen -= n;
486  if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
487    pmin += dmin;
488    pmax += dmax;
489  } else if (pmax.size() > 0) {
490    // prog_->PossibleMatchRange has failed us,
491    // but we still have useful information from prefix_.
492    // Round up pmax to allow any possible suffix.
493    pmax = PrefixSuccessor(pmax);
494  } else {
495    // Nothing useful.
496    *min = "";
497    *max = "";
498    return false;
499  }
500
501  *min = pmin;
502  *max = pmax;
503  return true;
504}
505
506// Avoid possible locale nonsense in standard strcasecmp.
507// The string a is known to be all lowercase.
508static int ascii_strcasecmp(const char* a, const char* b, int len) {
509  const char *ae = a + len;
510
511  for (; a < ae; a++, b++) {
512    uint8 x = *a;
513    uint8 y = *b;
514    if ('A' <= y && y <= 'Z')
515      y += 'a' - 'A';
516    if (x != y)
517      return x - y;
518  }
519  return 0;
520}
521
522
523/***** Actual matching and rewriting code *****/
524
525bool RE2::Match(const StringPiece& text,
526                int startpos,
527                int endpos,
528                Anchor re_anchor,
529                StringPiece* submatch,
530                int nsubmatch) const {
531  if (!ok() || suffix_regexp_ == NULL) {
532    if (options_.log_errors())
533      LOG(ERROR) << "Invalid RE2: " << *error_;
534    return false;
535  }
536
537  if (startpos < 0 || startpos > endpos || endpos > text.size()) {
538    if (options_.log_errors())
539      LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
540    return false;
541  }
542
543  StringPiece subtext = text;
544  subtext.remove_prefix(startpos);
545  subtext.remove_suffix(text.size() - endpos);
546
547  // Use DFAs to find exact location of match, filter out non-matches.
548
549  // Don't ask for the location if we won't use it.
550  // SearchDFA can do extra optimizations in that case.
551  StringPiece match;
552  StringPiece* matchp = &match;
553  if (nsubmatch == 0)
554    matchp = NULL;
555
556  int ncap = 1 + NumberOfCapturingGroups();
557  if (ncap > nsubmatch)
558    ncap = nsubmatch;
559
560  // If the regexp is anchored explicitly, must not be in middle of text.
561  if (prog_->anchor_start() && startpos != 0)
562    return false;
563
564  // If the regexp is anchored explicitly, update re_anchor
565  // so that we can potentially fall into a faster case below.
566  if (prog_->anchor_start() && prog_->anchor_end())
567    re_anchor = ANCHOR_BOTH;
568  else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
569    re_anchor = ANCHOR_START;
570
571  // Check for the required prefix, if any.
572  int prefixlen = 0;
573  if (!prefix_.empty()) {
574    if (startpos != 0)
575      return false;
576    prefixlen = prefix_.size();
577    if (prefixlen > subtext.size())
578      return false;
579    if (prefix_foldcase_) {
580      if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
581        return false;
582    } else {
583      if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
584        return false;
585    }
586    subtext.remove_prefix(prefixlen);
587    // If there is a required prefix, the anchor must be at least ANCHOR_START.
588    if (re_anchor != ANCHOR_BOTH)
589      re_anchor = ANCHOR_START;
590  }
591
592  Prog::Anchor anchor = Prog::kUnanchored;
593  Prog::MatchKind kind = Prog::kFirstMatch;
594  if (options_.longest_match())
595    kind = Prog::kLongestMatch;
596  bool skipped_test = false;
597
598  bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
599
600  // SearchBitState allocates a bit vector of size prog_->size() * text.size().
601  // It also allocates a stack of 3-word structures which could potentially
602  // grow as large as prog_->size() * text.size() but in practice is much
603  // smaller.
604  // Conditions for using SearchBitState:
605  const int MaxBitStateProg = 500;   // prog_->size() <= Max.
606  const int MaxBitStateVector = 256*1024;  // bit vector size <= Max (bits)
607  bool can_bit_state = prog_->size() <= MaxBitStateProg;
608  int bit_state_text_max = MaxBitStateVector / prog_->size();
609
610  bool dfa_failed = false;
611  switch (re_anchor) {
612    default:
613    case UNANCHORED: {
614      if (!prog_->SearchDFA(subtext, text, anchor, kind,
615                            matchp, &dfa_failed, NULL)) {
616        if (dfa_failed) {
617          // Fall back to NFA below.
618          skipped_test = true;
619          if (FLAGS_trace_re2)
620            LOG(INFO) << "Match " << trunc(pattern_)
621                      << " [" << CEscape(subtext) << "]"
622                      << " DFA failed.";
623          break;
624        }
625        if (FLAGS_trace_re2)
626          LOG(INFO) << "Match " << trunc(pattern_)
627                    << " [" << CEscape(subtext) << "]"
628                    << " used DFA - no match.";
629        return false;
630      }
631      if (FLAGS_trace_re2)
632        LOG(INFO) << "Match " << trunc(pattern_)
633                  << " [" << CEscape(subtext) << "]"
634                  << " used DFA - match";
635      if (matchp == NULL)  // Matched.  Don't care where
636        return true;
637      // SearchDFA set match[0].end() but didn't know where the
638      // match started.  Run the regexp backward from match[0].end()
639      // to find the longest possible match -- that's where it started.
640      Prog* prog = ReverseProg();
641      if (prog == NULL)
642        return false;
643      if (!prog->SearchDFA(match, text, Prog::kAnchored,
644                           Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
645        if (dfa_failed) {
646          // Fall back to NFA below.
647          skipped_test = true;
648          if (FLAGS_trace_re2)
649            LOG(INFO) << "Match " << trunc(pattern_)
650                      << " [" << CEscape(subtext) << "]"
651                      << " reverse DFA failed.";
652          break;
653        }
654        if (FLAGS_trace_re2)
655          LOG(INFO) << "Match " << trunc(pattern_)
656                    << " [" << CEscape(subtext) << "]"
657                    << " DFA inconsistency.";
658        if (options_.log_errors())
659          LOG(ERROR) << "DFA inconsistency";
660        return false;
661      }
662      if (FLAGS_trace_re2)
663        LOG(INFO) << "Match " << trunc(pattern_)
664                  << " [" << CEscape(subtext) << "]"
665                  << " used reverse DFA.";
666      break;
667    }
668
669    case ANCHOR_BOTH:
670    case ANCHOR_START:
671      if (re_anchor == ANCHOR_BOTH)
672        kind = Prog::kFullMatch;
673      anchor = Prog::kAnchored;
674
675      // If only a small amount of text and need submatch
676      // information anyway and we're going to use OnePass or BitState
677      // to get it, we might as well not even bother with the DFA:
678      // OnePass or BitState will be fast enough.
679      // On tiny texts, OnePass outruns even the DFA, and
680      // it doesn't have the shared state and occasional mutex that
681      // the DFA does.
682      if (can_one_pass && text.size() <= 4096 &&
683          (ncap > 1 || text.size() <= 8)) {
684        if (FLAGS_trace_re2)
685          LOG(INFO) << "Match " << trunc(pattern_)
686                    << " [" << CEscape(subtext) << "]"
687                    << " skipping DFA for OnePass.";
688        skipped_test = true;
689        break;
690      }
691      if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
692        if (FLAGS_trace_re2)
693          LOG(INFO) << "Match " << trunc(pattern_)
694                    << " [" << CEscape(subtext) << "]"
695                    << " skipping DFA for BitState.";
696        skipped_test = true;
697        break;
698      }
699      if (!prog_->SearchDFA(subtext, text, anchor, kind,
700                            &match, &dfa_failed, NULL)) {
701        if (dfa_failed) {
702          if (FLAGS_trace_re2)
703            LOG(INFO) << "Match " << trunc(pattern_)
704                      << " [" << CEscape(subtext) << "]"
705                      << " DFA failed.";
706          skipped_test = true;
707          break;
708        }
709        if (FLAGS_trace_re2)
710          LOG(INFO) << "Match " << trunc(pattern_)
711                    << " [" << CEscape(subtext) << "]"
712                    << " used DFA - no match.";
713        return false;
714      }
715      break;
716  }
717
718  if (!skipped_test && ncap <= 1) {
719    // We know exactly where it matches.  That's enough.
720    if (ncap == 1)
721      submatch[0] = match;
722  } else {
723    StringPiece subtext1;
724    if (skipped_test) {
725      // DFA ran out of memory or was skipped:
726      // need to search in entire original text.
727      subtext1 = subtext;
728    } else {
729      // DFA found the exact match location:
730      // let NFA run an anchored, full match search
731      // to find submatch locations.
732      subtext1 = match;
733      anchor = Prog::kAnchored;
734      kind = Prog::kFullMatch;
735    }
736
737    if (can_one_pass && anchor != Prog::kUnanchored) {
738      if (FLAGS_trace_re2)
739        LOG(INFO) << "Match " << trunc(pattern_)
740                  << " [" << CEscape(subtext) << "]"
741                  << " using OnePass.";
742      if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
743        if (!skipped_test && options_.log_errors())
744          LOG(ERROR) << "SearchOnePass inconsistency";
745        return false;
746      }
747    } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
748      if (FLAGS_trace_re2)
749        LOG(INFO) << "Match " << trunc(pattern_)
750                  << " [" << CEscape(subtext) << "]"
751                  << " using BitState.";
752      if (!prog_->SearchBitState(subtext1, text, anchor,
753                                 kind, submatch, ncap)) {
754        if (!skipped_test && options_.log_errors())
755          LOG(ERROR) << "SearchBitState inconsistency";
756        return false;
757      }
758    } else {
759      if (FLAGS_trace_re2)
760        LOG(INFO) << "Match " << trunc(pattern_)
761                  << " [" << CEscape(subtext) << "]"
762                  << " using NFA.";
763      if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
764        if (!skipped_test && options_.log_errors())
765          LOG(ERROR) << "SearchNFA inconsistency";
766        return false;
767      }
768    }
769  }
770
771  // Adjust overall match for required prefix that we stripped off.
772  if (prefixlen > 0 && nsubmatch > 0)
773    submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
774                              submatch[0].size() + prefixlen);
775
776  // Zero submatches that don't exist in the regexp.
777  for (int i = ncap; i < nsubmatch; i++)
778    submatch[i] = NULL;
779  return true;
780}
781
782// Internal matcher - like Match() but takes Args not StringPieces.
783bool RE2::DoMatch(const StringPiece& text,
784                  Anchor anchor,
785                  int* consumed,
786                  const Arg* const* args,
787                  int n) const {
788  if (!ok()) {
789    if (options_.log_errors())
790      LOG(ERROR) << "Invalid RE2: " << *error_;
791    return false;
792  }
793
794  // Count number of capture groups needed.
795  int nvec;
796  if (n == 0 && consumed == NULL)
797    nvec = 0;
798  else
799    nvec = n+1;
800
801  StringPiece* vec;
802  StringPiece stkvec[kVecSize];
803  StringPiece* heapvec = NULL;
804
805  if (nvec <= arraysize(stkvec)) {
806    vec = stkvec;
807  } else {
808    vec = new StringPiece[nvec];
809    heapvec = vec;
810  }
811
812  if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
813    delete[] heapvec;
814    return false;
815  }
816
817  if(consumed != NULL)
818    *consumed = vec[0].end() - text.begin();
819
820  if (n == 0 || args == NULL) {
821    // We are not interested in results
822    delete[] heapvec;
823    return true;
824  }
825
826  int ncap = NumberOfCapturingGroups();
827  if (ncap < n) {
828    // RE has fewer capturing groups than number of arg pointers passed in
829    VLOG(1) << "Asked for " << n << " but only have " << ncap;
830    delete[] heapvec;
831    return false;
832  }
833
834  // If we got here, we must have matched the whole pattern.
835  for (int i = 0; i < n; i++) {
836    const StringPiece& s = vec[i+1];
837    if (!args[i]->Parse(s.data(), s.size())) {
838      // TODO: Should we indicate what the error was?
839      VLOG(1) << "Parse error on #" << i << " " << s << " "
840	      << (void*)s.data() << "/" << s.size();
841      delete[] heapvec;
842      return false;
843    }
844  }
845
846  delete[] heapvec;
847  return true;
848}
849
850// Append the "rewrite" string, with backslash subsitutions from "vec",
851// to string "out".
852bool RE2::Rewrite(string *out, const StringPiece &rewrite,
853                 const StringPiece *vec, int veclen) const {
854  for (const char *s = rewrite.data(), *end = s + rewrite.size();
855       s < end; s++) {
856    int c = *s;
857    if (c == '\\') {
858      s++;
859      c = (s < end) ? *s : -1;
860      if (isdigit(c)) {
861        int n = (c - '0');
862        if (n >= veclen) {
863          if (options_.log_errors()) {
864            LOG(ERROR) << "requested group " << n
865                       << " in regexp " << rewrite.data();
866          }
867          return false;
868        }
869        StringPiece snip = vec[n];
870        if (snip.size() > 0)
871          out->append(snip.data(), snip.size());
872      } else if (c == '\\') {
873        out->push_back('\\');
874      } else {
875        if (options_.log_errors())
876          LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
877        return false;
878      }
879    } else {
880      out->push_back(c);
881    }
882  }
883  return true;
884}
885
886// Return the number of capturing subpatterns, or -1 if the
887// regexp wasn't valid on construction.
888int RE2::NumberOfCapturingGroups() const {
889  if (suffix_regexp_ == NULL)
890    return -1;
891  ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
892    " multiple threads end up doing the same work in parallel.");
893  if (num_captures_ == -1)
894    num_captures_ = suffix_regexp_->NumCaptures();
895  return num_captures_;
896}
897
898// Checks that the rewrite string is well-formed with respect to this
899// regular expression.
900bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
901  int max_token = -1;
902  for (const char *s = rewrite.data(), *end = s + rewrite.size();
903       s < end; s++) {
904    int c = *s;
905    if (c != '\\') {
906      continue;
907    }
908    if (++s == end) {
909      *error = "Rewrite schema error: '\\' not allowed at end.";
910      return false;
911    }
912    c = *s;
913    if (c == '\\') {
914      continue;
915    }
916    if (!isdigit(c)) {
917      *error = "Rewrite schema error: "
918               "'\\' must be followed by a digit or '\\'.";
919      return false;
920    }
921    int n = (c - '0');
922    if (max_token < n) {
923      max_token = n;
924    }
925  }
926
927  if (max_token > NumberOfCapturingGroups()) {
928    SStringPrintf(error, "Rewrite schema requests %d matches, "
929                  "but the regexp only has %d parenthesized subexpressions.",
930                  max_token, NumberOfCapturingGroups());
931    return false;
932  }
933  return true;
934}
935
936/***** Parsers for various types *****/
937
938bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
939  // We fail if somebody asked us to store into a non-NULL void* pointer
940  return (dest == NULL);
941}
942
943bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
944  if (dest == NULL) return true;
945  reinterpret_cast<string*>(dest)->assign(str, n);
946  return true;
947}
948
949bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
950  if (dest == NULL) return true;
951  reinterpret_cast<StringPiece*>(dest)->set(str, n);
952  return true;
953}
954
955bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
956  if (n != 1) return false;
957  if (dest == NULL) return true;
958  *(reinterpret_cast<char*>(dest)) = str[0];
959  return true;
960}
961
962bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
963  if (n != 1) return false;
964  if (dest == NULL) return true;
965  *(reinterpret_cast<unsigned char*>(dest)) = str[0];
966  return true;
967}
968
969// Largest number spec that we are willing to parse
970static const int kMaxNumberLength = 32;
971
972// REQUIRES "buf" must have length at least kMaxNumberLength+1
973// Copies "str" into "buf" and null-terminates.
974// Overwrites *np with the new length.
975static const char* TerminateNumber(char* buf, const char* str, int* np) {
976  int n = *np;
977  if (n <= 0) return "";
978  if (n > 0 && isspace(*str)) {
979    // We are less forgiving than the strtoxxx() routines and do not
980    // allow leading spaces.
981    return "";
982  }
983
984  // Although buf has a fixed maximum size, we can still handle
985  // arbitrarily large integers correctly by omitting leading zeros.
986  // (Numbers that are still too long will be out of range.)
987  // Before deciding whether str is too long,
988  // remove leading zeros with s/000+/00/.
989  // Leaving the leading two zeros in place means that
990  // we don't change 0000x123 (invalid) into 0x123 (valid).
991  // Skip over leading - before replacing.
992  bool neg = false;
993  if (n >= 1 && str[0] == '-') {
994    neg = true;
995    n--;
996    str++;
997  }
998
999  if (n >= 3 && str[0] == '0' && str[1] == '0') {
1000    while (n >= 3 && str[2] == '0') {
1001      n--;
1002      str++;
1003    }
1004  }
1005
1006  if (neg) {  // make room in buf for -
1007    n++;
1008    str--;
1009  }
1010
1011  if (n > kMaxNumberLength) return "";
1012
1013  memmove(buf, str, n);
1014  if (neg) {
1015    buf[0] = '-';
1016  }
1017  buf[n] = '\0';
1018  *np = n;
1019  return buf;
1020}
1021
1022bool RE2::Arg::parse_long_radix(const char* str,
1023                               int n,
1024                               void* dest,
1025                               int radix) {
1026  if (n == 0) return false;
1027  char buf[kMaxNumberLength+1];
1028  str = TerminateNumber(buf, str, &n);
1029  char* end;
1030  errno = 0;
1031  long r = strtol(str, &end, radix);
1032  if (end != str + n) return false;   // Leftover junk
1033  if (errno) return false;
1034  if (dest == NULL) return true;
1035  *(reinterpret_cast<long*>(dest)) = r;
1036  return true;
1037}
1038
1039bool RE2::Arg::parse_ulong_radix(const char* str,
1040                                int n,
1041                                void* dest,
1042                                int radix) {
1043  if (n == 0) return false;
1044  char buf[kMaxNumberLength+1];
1045  str = TerminateNumber(buf, str, &n);
1046  if (str[0] == '-') {
1047   // strtoul() will silently accept negative numbers and parse
1048   // them.  This module is more strict and treats them as errors.
1049   return false;
1050  }
1051
1052  char* end;
1053  errno = 0;
1054  unsigned long r = strtoul(str, &end, radix);
1055  if (end != str + n) return false;   // Leftover junk
1056  if (errno) return false;
1057  if (dest == NULL) return true;
1058  *(reinterpret_cast<unsigned long*>(dest)) = r;
1059  return true;
1060}
1061
1062bool RE2::Arg::parse_short_radix(const char* str,
1063                                int n,
1064                                void* dest,
1065                                int radix) {
1066  long r;
1067  if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1068  if ((short)r != r) return false;       // Out of range
1069  if (dest == NULL) return true;
1070  *(reinterpret_cast<short*>(dest)) = r;
1071  return true;
1072}
1073
1074bool RE2::Arg::parse_ushort_radix(const char* str,
1075                                 int n,
1076                                 void* dest,
1077                                 int radix) {
1078  unsigned long r;
1079  if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1080  if ((ushort)r != r) return false;                      // Out of range
1081  if (dest == NULL) return true;
1082  *(reinterpret_cast<unsigned short*>(dest)) = r;
1083  return true;
1084}
1085
1086bool RE2::Arg::parse_int_radix(const char* str,
1087                              int n,
1088                              void* dest,
1089                              int radix) {
1090  long r;
1091  if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1092  if ((int)r != r) return false;         // Out of range
1093  if (dest == NULL) return true;
1094  *(reinterpret_cast<int*>(dest)) = r;
1095  return true;
1096}
1097
1098bool RE2::Arg::parse_uint_radix(const char* str,
1099                               int n,
1100                               void* dest,
1101                               int radix) {
1102  unsigned long r;
1103  if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1104  if ((uint)r != r) return false;                       // Out of range
1105  if (dest == NULL) return true;
1106  *(reinterpret_cast<unsigned int*>(dest)) = r;
1107  return true;
1108}
1109
1110bool RE2::Arg::parse_longlong_radix(const char* str,
1111                                   int n,
1112                                   void* dest,
1113                                   int radix) {
1114  if (n == 0) return false;
1115  char buf[kMaxNumberLength+1];
1116  str = TerminateNumber(buf, str, &n);
1117  char* end;
1118  errno = 0;
1119  int64 r = strtoll(str, &end, radix);
1120  if (end != str + n) return false;   // Leftover junk
1121  if (errno) return false;
1122  if (dest == NULL) return true;
1123  *(reinterpret_cast<int64*>(dest)) = r;
1124  return true;
1125}
1126
1127bool RE2::Arg::parse_ulonglong_radix(const char* str,
1128                                    int n,
1129                                    void* dest,
1130                                    int radix) {
1131  if (n == 0) return false;
1132  char buf[kMaxNumberLength+1];
1133  str = TerminateNumber(buf, str, &n);
1134  if (str[0] == '-') {
1135    // strtoull() will silently accept negative numbers and parse
1136    // them.  This module is more strict and treats them as errors.
1137    return false;
1138  }
1139  char* end;
1140  errno = 0;
1141  uint64 r = strtoull(str, &end, radix);
1142  if (end != str + n) return false;   // Leftover junk
1143  if (errno) return false;
1144  if (dest == NULL) return true;
1145  *(reinterpret_cast<uint64*>(dest)) = r;
1146  return true;
1147}
1148
1149static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
1150  if (n == 0) return false;
1151  static const int kMaxLength = 200;
1152  char buf[kMaxLength];
1153  if (n >= kMaxLength) return false;
1154  memcpy(buf, str, n);
1155  buf[n] = '\0';
1156  errno = 0;
1157  char* end;
1158  double r;
1159  if (isfloat) {
1160    r = strtof(buf, &end);
1161  } else {
1162    r = strtod(buf, &end);
1163  }
1164  if (end != buf + n) return false;   // Leftover junk
1165  if (errno) return false;
1166  if (dest == NULL) return true;
1167  if (isfloat) {
1168    *(reinterpret_cast<float*>(dest)) = r;
1169  } else {
1170    *(reinterpret_cast<double*>(dest)) = r;
1171  }
1172  return true;
1173}
1174
1175bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
1176  return parse_double_float(str, n, false, dest);
1177}
1178
1179bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
1180  return parse_double_float(str, n, true, dest);
1181}
1182
1183
1184#define DEFINE_INTEGER_PARSERS(name)                                        \
1185  bool RE2::Arg::parse_##name(const char* str, int n, void* dest) {          \
1186    return parse_##name##_radix(str, n, dest, 10);                          \
1187  }                                                                         \
1188  bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
1189    return parse_##name##_radix(str, n, dest, 16);                          \
1190  }                                                                         \
1191  bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
1192    return parse_##name##_radix(str, n, dest, 8);                           \
1193  }                                                                         \
1194  bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
1195    return parse_##name##_radix(str, n, dest, 0);                           \
1196  }
1197
1198DEFINE_INTEGER_PARSERS(short);
1199DEFINE_INTEGER_PARSERS(ushort);
1200DEFINE_INTEGER_PARSERS(int);
1201DEFINE_INTEGER_PARSERS(uint);
1202DEFINE_INTEGER_PARSERS(long);
1203DEFINE_INTEGER_PARSERS(ulong);
1204DEFINE_INTEGER_PARSERS(longlong);
1205DEFINE_INTEGER_PARSERS(ulonglong);
1206
1207#undef DEFINE_INTEGER_PARSERS
1208
1209}  // namespace re2
1210