1// Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Regular expression interface RE2.
6//
7// Originally the PCRE C++ wrapper, but adapted to use
8// the new automata-based regular expression engines.
9
10#include "re2/re2.h"
11
12#include <stdio.h>
13#include <ctype.h>
14#include <string>
15#include <pthread.h>
16#include <errno.h>
17#include "util/util.h"
18#include "util/flags.h"
19#include "re2/prog.h"
20#include "re2/regexp.h"
21
22DEFINE_bool(trace_re2, false, "trace RE2 execution");
23
24namespace re2 {
25
26// Maximum number of args we can set
27static const int kMaxArgs = 16;
28static const int kVecSize = 1+kMaxArgs;
29
30const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch;
31const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch;
32const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume;
33const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume;
34
35const int RE2::Options::kDefaultMaxMem;  // initialized in re2.h
36
37// Commonly-used option sets; arguments to constructor are:
38//   utf8 input
39//   posix syntax
40//   longest match
41//   log errors
42const RE2::Options RE2::DefaultOptions;  // EncodingUTF8, false, false, true
43const RE2::Options RE2::Latin1(RE2::Options::EncodingLatin1, false, false, true);
44const RE2::Options RE2::POSIX(RE2::Options::EncodingUTF8, true, true, true);
45const RE2::Options RE2::Quiet(RE2::Options::EncodingUTF8, false, false, false);
46
47// If a regular expression has no error, its error_ field points here
48static const string empty_string;
49
50// Converts from Regexp error code to RE2 error code.
51// Maybe some day they will diverge.  In any event, this
52// hides the existence of Regexp from RE2 users.
53static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
54  switch (code) {
55    case re2::kRegexpSuccess:
56      return RE2::NoError;
57    case re2::kRegexpInternalError:
58      return RE2::ErrorInternal;
59    case re2::kRegexpBadEscape:
60      return RE2::ErrorBadEscape;
61    case re2::kRegexpBadCharClass:
62      return RE2::ErrorBadCharClass;
63    case re2::kRegexpBadCharRange:
64      return RE2::ErrorBadCharRange;
65    case re2::kRegexpMissingBracket:
66      return RE2::ErrorMissingBracket;
67    case re2::kRegexpMissingParen:
68      return RE2::ErrorMissingParen;
69    case re2::kRegexpTrailingBackslash:
70      return RE2::ErrorTrailingBackslash;
71    case re2::kRegexpRepeatArgument:
72      return RE2::ErrorRepeatArgument;
73    case re2::kRegexpRepeatSize:
74      return RE2::ErrorRepeatSize;
75    case re2::kRegexpRepeatOp:
76      return RE2::ErrorRepeatOp;
77    case re2::kRegexpBadPerlOp:
78      return RE2::ErrorBadPerlOp;
79    case re2::kRegexpBadUTF8:
80      return RE2::ErrorBadUTF8;
81    case re2::kRegexpBadNamedCapture:
82      return RE2::ErrorBadNamedCapture;
83  }
84  return RE2::ErrorInternal;
85}
86
87static string trunc(const StringPiece& pattern) {
88  if (pattern.size() < 100)
89    return pattern.as_string();
90  return pattern.substr(0, 100).as_string() + "...";
91}
92
93
94RE2::RE2(const char* pattern) {
95  Init(pattern, DefaultOptions);
96}
97
98RE2::RE2(const string& pattern) {
99  Init(pattern, DefaultOptions);
100}
101
102RE2::RE2(const StringPiece& pattern) {
103  Init(pattern, DefaultOptions);
104}
105
106RE2::RE2(const StringPiece& pattern, const Options& options) {
107  Init(pattern, options);
108}
109
110int RE2::Options::ParseFlags() const {
111  int flags = Regexp::ClassNL;
112  switch (encoding()) {
113    default:
114      LOG(ERROR) << "Unknown encoding " << encoding();
115      break;
116    case RE2::Options::EncodingUTF8:
117      break;
118    case RE2::Options::EncodingLatin1:
119      flags |= Regexp::Latin1;
120      break;
121  }
122
123  if (!posix_syntax())
124    flags |= Regexp::LikePerl;
125
126  if (literal())
127    flags |= Regexp::Literal;
128
129  if (never_nl())
130    flags |= Regexp::NeverNL;
131
132  if (!case_sensitive())
133    flags |= Regexp::FoldCase;
134
135  if (perl_classes())
136    flags |= Regexp::PerlClasses;
137
138  if (word_boundary())
139    flags |= Regexp::PerlB;
140
141  if (one_line())
142    flags |= Regexp::OneLine;
143
144  return flags;
145}
146
147void RE2::Init(const StringPiece& pattern, const Options& options) {
148  mutex_ = new Mutex;
149  pattern_ = pattern.as_string();
150  options_.Copy(options);
151  error_ = &empty_string;
152  error_code_ = NoError;
153  suffix_regexp_ = NULL;
154  entire_regexp_ = NULL;
155  prog_ = NULL;
156  rprog_ = NULL;
157  named_groups_ = NULL;
158  group_names_ = NULL;
159  num_captures_ = -1;
160
161  RegexpStatus status;
162  entire_regexp_ = Regexp::Parse(
163    pattern_,
164    static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
165    &status);
166  if (entire_regexp_ == NULL) {
167    if (error_ == &empty_string)
168      error_ = new string(status.Text());
169    if (options_.log_errors()) {
170      LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
171                 << status.Text();
172    }
173    error_arg_ = status.error_arg().as_string();
174    error_code_ = RegexpErrorToRE2(status.code());
175    return;
176  }
177
178  prefix_.clear();
179  prefix_foldcase_ = false;
180  re2::Regexp* suffix;
181  if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
182    suffix_regexp_ = suffix;
183  else
184    suffix_regexp_ = entire_regexp_->Incref();
185
186  // Two thirds of the memory goes to the forward Prog,
187  // one third to the reverse prog, because the forward
188  // Prog has two DFAs but the reverse prog has one.
189  prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
190  if (prog_ == NULL) {
191    if (options_.log_errors())
192      LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
193    error_ = new string("pattern too large - compile failed");
194    error_code_ = RE2::ErrorPatternTooLarge;
195    return;
196  }
197
198  // Could delay this until the first match call that
199  // cares about submatch information, but the one-pass
200  // machine's memory gets cut from the DFA memory budget,
201  // and that is harder to do if the DFA has already
202  // been built.
203  is_one_pass_ = prog_->IsOnePass();
204}
205
206// Returns rprog_, computing it if needed.
207re2::Prog* RE2::ReverseProg() const {
208  MutexLock l(mutex_);
209  if (rprog_ == NULL && error_ == &empty_string) {
210    rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
211    if (rprog_ == NULL) {
212      if (options_.log_errors())
213        LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
214      error_ = new string("pattern too large - reverse compile failed");
215      error_code_ = RE2::ErrorPatternTooLarge;
216      return NULL;
217    }
218  }
219  return rprog_;
220}
221
222static const map<string, int> empty_named_groups;
223static const map<int, string> empty_group_names;
224
225RE2::~RE2() {
226  if (suffix_regexp_)
227    suffix_regexp_->Decref();
228  if (entire_regexp_)
229    entire_regexp_->Decref();
230  delete mutex_;
231  delete prog_;
232  delete rprog_;
233  if (error_ != &empty_string)
234    delete error_;
235  if (named_groups_ != NULL && named_groups_ != &empty_named_groups)
236    delete named_groups_;
237  if (group_names_ != NULL &&  group_names_ != &empty_group_names)
238    delete group_names_;
239}
240
241int RE2::ProgramSize() const {
242  if (prog_ == NULL)
243    return -1;
244  return prog_->size();
245}
246
247// Returns named_groups_, computing it if needed.
248const map<string, int>&  RE2::NamedCapturingGroups() const {
249  MutexLock l(mutex_);
250  if (!ok())
251    return empty_named_groups;
252  if (named_groups_ == NULL) {
253    named_groups_ = suffix_regexp_->NamedCaptures();
254    if (named_groups_ == NULL)
255      named_groups_ = &empty_named_groups;
256  }
257  return *named_groups_;
258}
259
260// Returns group_names_, computing it if needed.
261const map<int, string>&  RE2::CapturingGroupNames() const {
262  MutexLock l(mutex_);
263  if (!ok())
264    return empty_group_names;
265  if (group_names_ == NULL) {
266    group_names_ = suffix_regexp_->CaptureNames();
267    if (group_names_ == NULL)
268      group_names_ = &empty_group_names;
269  }
270  return *group_names_;
271}
272
273/***** Convenience interfaces *****/
274
275bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
276                     const Arg* const args[], int n) {
277  return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
278}
279
280bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
281                        const Arg* const args[], int n) {
282  return re.DoMatch(text, UNANCHORED, NULL, args, n);
283}
284
285bool RE2::ConsumeN(StringPiece* input, const RE2& re,
286                   const Arg* const args[], int n) {
287  int consumed;
288  if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
289    input->remove_prefix(consumed);
290    return true;
291  } else {
292    return false;
293  }
294}
295
296bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
297                          const Arg* const args[], int n) {
298  int consumed;
299  if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
300    input->remove_prefix(consumed);
301    return true;
302  } else {
303    return false;
304  }
305}
306
307// Returns the maximum submatch needed for the rewrite to be done by Replace().
308// E.g. if rewrite == "foo \\2,\\1", returns 2.
309static int MaxSubmatch(const StringPiece& rewrite) {
310  int max = 0;
311  for (const char *s = rewrite.data(), *end = s + rewrite.size();
312       s < end; s++) {
313    if (*s == '\\') {
314      s++;
315      int c = (s < end) ? *s : -1;
316      if (isdigit(c)) {
317        int n = (c - '0');
318        if (n > max)
319          max = n;
320      }
321    }
322  }
323  return max;
324}
325
326bool RE2::Replace(string *str,
327                 const RE2& re,
328                 const StringPiece& rewrite) {
329  StringPiece vec[kVecSize];
330  int nvec = 1 + MaxSubmatch(rewrite);
331  if (nvec > arraysize(vec))
332    return false;
333  if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
334    return false;
335
336  string s;
337  if (!re.Rewrite(&s, rewrite, vec, nvec))
338    return false;
339
340  assert(vec[0].begin() >= str->data());
341  assert(vec[0].end() <= str->data()+str->size());
342  str->replace(vec[0].data() - str->data(), vec[0].size(), s);
343  return true;
344}
345
346int RE2::GlobalReplace(string *str,
347                      const RE2& re,
348                      const StringPiece& rewrite) {
349  StringPiece vec[kVecSize];
350  int nvec = 1 + MaxSubmatch(rewrite);
351  if (nvec > arraysize(vec))
352    return false;
353
354  const char* p = str->data();
355  const char* ep = p + str->size();
356  const char* lastend = NULL;
357  string out;
358  int count = 0;
359  while (p <= ep) {
360    if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))
361      break;
362    if (p < vec[0].begin())
363      out.append(p, vec[0].begin() - p);
364    if (vec[0].begin() == lastend && vec[0].size() == 0) {
365      // Disallow empty match at end of last match: skip ahead.
366      if (p < ep)
367        out.append(p, 1);
368      p++;
369      continue;
370    }
371    re.Rewrite(&out, rewrite, vec, nvec);
372    p = vec[0].end();
373    lastend = p;
374    count++;
375  }
376
377  if (count == 0)
378    return 0;
379
380  if (p < ep)
381    out.append(p, ep - p);
382  swap(out, *str);
383  return count;
384}
385
386bool RE2::Extract(const StringPiece &text,
387                 const RE2& re,
388                 const StringPiece &rewrite,
389                 string *out) {
390  StringPiece vec[kVecSize];
391  int nvec = 1 + MaxSubmatch(rewrite);
392  if (nvec > arraysize(vec))
393    return false;
394
395  if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
396    return false;
397
398  out->clear();
399  return re.Rewrite(out, rewrite, vec, nvec);
400}
401
402string RE2::QuoteMeta(const StringPiece& unquoted) {
403  string result;
404  result.reserve(unquoted.size() << 1);
405
406  // Escape any ascii character not in [A-Za-z_0-9].
407  //
408  // Note that it's legal to escape a character even if it has no
409  // special meaning in a regular expression -- so this function does
410  // that.  (This also makes it identical to the perl function of the
411  // same name except for the null-character special case;
412  // see `perldoc -f quotemeta`.)
413  for (int ii = 0; ii < unquoted.length(); ++ii) {
414    // Note that using 'isalnum' here raises the benchmark time from
415    // 32ns to 58ns:
416    if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
417        (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
418        (unquoted[ii] < '0' || unquoted[ii] > '9') &&
419        unquoted[ii] != '_' &&
420        // If this is the part of a UTF8 or Latin1 character, we need
421        // to copy this byte without escaping.  Experimentally this is
422        // what works correctly with the regexp library.
423        !(unquoted[ii] & 128)) {
424      if (unquoted[ii] == '\0') {  // Special handling for null chars.
425        // Note that this special handling is not strictly required for RE2,
426        // but this quoting is required for other regexp libraries such as
427        // PCRE.
428        // Can't use "\\0" since the next character might be a digit.
429        result += "\\x00";
430        continue;
431      }
432      result += '\\';
433    }
434    result += unquoted[ii];
435  }
436
437  return result;
438}
439
440bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
441  if (prog_ == NULL)
442    return false;
443
444  int n = prefix_.size();
445  if (n > maxlen)
446    n = maxlen;
447
448  // Determine initial min max from prefix_ literal.
449  string pmin, pmax;
450  pmin = prefix_.substr(0, n);
451  pmax = prefix_.substr(0, n);
452  if (prefix_foldcase_) {
453    // prefix is ASCII lowercase; change pmin to uppercase.
454    for (int i = 0; i < n; i++) {
455      if ('a' <= pmin[i] && pmin[i] <= 'z')
456        pmin[i] += 'A' - 'a';
457    }
458  }
459
460  // Add to prefix min max using PossibleMatchRange on regexp.
461  string dmin, dmax;
462  maxlen -= n;
463  if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
464    pmin += dmin;
465    pmax += dmax;
466  } else if (pmax.size() > 0) {
467    // prog_->PossibleMatchRange has failed us,
468    // but we still have useful information from prefix_.
469    // Round up pmax to allow any possible suffix.
470    pmax = PrefixSuccessor(pmax);
471  } else {
472    // Nothing useful.
473    *min = "";
474    *max = "";
475    return false;
476  }
477
478  *min = pmin;
479  *max = pmax;
480  return true;
481}
482
483// Avoid possible locale nonsense in standard strcasecmp.
484// The string a is known to be all lowercase.
485static int ascii_strcasecmp(const char* a, const char* b, int len) {
486  const char *ae = a + len;
487
488  for (; a < ae; a++, b++) {
489    uint8 x = *a;
490    uint8 y = *b;
491    if ('A' <= y && y <= 'Z')
492      y += 'a' - 'A';
493    if (x != y)
494      return x - y;
495  }
496  return 0;
497}
498
499
500/***** Actual matching and rewriting code *****/
501
502bool RE2::Match(const StringPiece& text,
503                int startpos,
504                int endpos,
505                Anchor re_anchor,
506                StringPiece* submatch,
507                int nsubmatch) const {
508  if (!ok() || suffix_regexp_ == NULL) {
509    if (options_.log_errors())
510      LOG(ERROR) << "Invalid RE2: " << *error_;
511    return false;
512  }
513
514  if (startpos < 0 || startpos > endpos || endpos > text.size()) {
515    LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
516    return false;
517  }
518
519  StringPiece subtext = text;
520  subtext.remove_prefix(startpos);
521  subtext.remove_suffix(text.size() - endpos);
522
523  // Use DFAs to find exact location of match, filter out non-matches.
524
525  // Don't ask for the location if we won't use it.
526  // SearchDFA can do extra optimizations in that case.
527  StringPiece match;
528  StringPiece* matchp = &match;
529  if (nsubmatch == 0)
530    matchp = NULL;
531
532  int ncap = 1 + NumberOfCapturingGroups();
533  if (ncap > nsubmatch)
534    ncap = nsubmatch;
535
536  // If the regexp is anchored explicitly, must not be in middle of text.
537  if (prog_->anchor_start() && startpos != 0)
538    return false;
539
540  // If the regexp is anchored explicitly, update re_anchor
541  // so that we can potentially fall into a faster case below.
542  if (prog_->anchor_start() && prog_->anchor_end())
543    re_anchor = ANCHOR_BOTH;
544  else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
545    re_anchor = ANCHOR_START;
546
547  // Check for the required prefix, if any.
548  int prefixlen = 0;
549  if (!prefix_.empty()) {
550    if (startpos != 0)
551      return false;
552    prefixlen = prefix_.size();
553    if (prefixlen > subtext.size())
554      return false;
555    if (prefix_foldcase_) {
556      if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
557        return false;
558    } else {
559      if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
560        return false;
561    }
562    subtext.remove_prefix(prefixlen);
563    // If there is a required prefix, the anchor must be at least ANCHOR_START.
564    if (re_anchor != ANCHOR_BOTH)
565      re_anchor = ANCHOR_START;
566  }
567
568  Prog::Anchor anchor = Prog::kUnanchored;
569  Prog::MatchKind kind = Prog::kFirstMatch;
570  if (options_.longest_match())
571    kind = Prog::kLongestMatch;
572  bool skipped_test = false;
573
574  bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
575
576  // SearchBitState allocates a bit vector of size prog_->size() * text.size().
577  // It also allocates a stack of 3-word structures which could potentially
578  // grow as large as prog_->size() * text.size() but in practice is much
579  // smaller.
580  // Conditions for using SearchBitState:
581  const int MaxBitStateProg = 500;   // prog_->size() <= Max.
582  const int MaxBitStateVector = 256*1024;  // bit vector size <= Max (bits)
583  bool can_bit_state = prog_->size() <= MaxBitStateProg;
584  int bit_state_text_max = MaxBitStateVector / prog_->size();
585
586  bool dfa_failed = false;
587  switch (re_anchor) {
588    default:
589    case UNANCHORED: {
590      if (!prog_->SearchDFA(subtext, text, anchor, kind,
591                            matchp, &dfa_failed, NULL)) {
592        if (dfa_failed) {
593          // Fall back to NFA below.
594          skipped_test = true;
595          if (FLAGS_trace_re2)
596            LOG(INFO) << "Match " << trunc(pattern_)
597                      << " [" << CEscape(subtext) << "]"
598                      << " DFA failed.";
599          break;
600        }
601        if (FLAGS_trace_re2)
602          LOG(INFO) << "Match " << trunc(pattern_)
603                    << " [" << CEscape(subtext) << "]"
604                    << " used DFA - no match.";
605        return false;
606      }
607      if (FLAGS_trace_re2)
608        LOG(INFO) << "Match " << trunc(pattern_)
609                  << " [" << CEscape(subtext) << "]"
610                  << " used DFA - match";
611      if (matchp == NULL)  // Matched.  Don't care where
612        return true;
613      // SearchDFA set match[0].end() but didn't know where the
614      // match started.  Run the regexp backward from match[0].end()
615      // to find the longest possible match -- that's where it started.
616      Prog* prog = ReverseProg();
617      if (prog == NULL)
618        return false;
619      if (!prog->SearchDFA(match, text, Prog::kAnchored,
620                           Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
621        if (dfa_failed) {
622          // Fall back to NFA below.
623          skipped_test = true;
624          if (FLAGS_trace_re2)
625            LOG(INFO) << "Match " << trunc(pattern_)
626                      << " [" << CEscape(subtext) << "]"
627                      << " reverse DFA failed.";
628          break;
629        }
630        if (FLAGS_trace_re2)
631          LOG(INFO) << "Match " << trunc(pattern_)
632                    << " [" << CEscape(subtext) << "]"
633                    << " DFA inconsistency.";
634        LOG(ERROR) << "DFA inconsistency";
635        return false;
636      }
637      if (FLAGS_trace_re2)
638        LOG(INFO) << "Match " << trunc(pattern_)
639                  << " [" << CEscape(subtext) << "]"
640                  << " used reverse DFA.";
641      break;
642    }
643
644    case ANCHOR_BOTH:
645    case ANCHOR_START:
646      if (re_anchor == ANCHOR_BOTH)
647        kind = Prog::kFullMatch;
648      anchor = Prog::kAnchored;
649
650      // If only a small amount of text and need submatch
651      // information anyway and we're going to use OnePass or BitState
652      // to get it, we might as well not even bother with the DFA:
653      // OnePass or BitState will be fast enough.
654      // On tiny texts, OnePass outruns even the DFA, and
655      // it doesn't have the shared state and occasional mutex that
656      // the DFA does.
657      if (can_one_pass && text.size() <= 4096 &&
658          (ncap > 1 || text.size() <= 8)) {
659        if (FLAGS_trace_re2)
660          LOG(INFO) << "Match " << trunc(pattern_)
661                    << " [" << CEscape(subtext) << "]"
662                    << " skipping DFA for OnePass.";
663        skipped_test = true;
664        break;
665      }
666      if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
667        if (FLAGS_trace_re2)
668          LOG(INFO) << "Match " << trunc(pattern_)
669                    << " [" << CEscape(subtext) << "]"
670                    << " skipping DFA for BitState.";
671        skipped_test = true;
672        break;
673      }
674      if (!prog_->SearchDFA(subtext, text, anchor, kind,
675                            &match, &dfa_failed, NULL)) {
676        if (dfa_failed) {
677          if (FLAGS_trace_re2)
678            LOG(INFO) << "Match " << trunc(pattern_)
679                      << " [" << CEscape(subtext) << "]"
680                      << " DFA failed.";
681          skipped_test = true;
682          break;
683        }
684        if (FLAGS_trace_re2)
685          LOG(INFO) << "Match " << trunc(pattern_)
686                    << " [" << CEscape(subtext) << "]"
687                    << " used DFA - no match.";
688        return false;
689      }
690      break;
691  }
692
693  if (!skipped_test && ncap <= 1) {
694    // We know exactly where it matches.  That's enough.
695    if (ncap == 1)
696      submatch[0] = match;
697  } else {
698    StringPiece subtext1;
699    if (skipped_test) {
700      // DFA ran out of memory or was skipped:
701      // need to search in entire original text.
702      subtext1 = subtext;
703    } else {
704      // DFA found the exact match location:
705      // let NFA run an anchored, full match search
706      // to find submatch locations.
707      subtext1 = match;
708      anchor = Prog::kAnchored;
709      kind = Prog::kFullMatch;
710    }
711
712    if (can_one_pass && anchor != Prog::kUnanchored) {
713      if (FLAGS_trace_re2)
714        LOG(INFO) << "Match " << trunc(pattern_)
715                  << " [" << CEscape(subtext) << "]"
716                  << " using OnePass.";
717      if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
718        if (!skipped_test)
719          LOG(ERROR) << "SearchOnePass inconsistency";
720        return false;
721      }
722    } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
723      if (FLAGS_trace_re2)
724        LOG(INFO) << "Match " << trunc(pattern_)
725                  << " [" << CEscape(subtext) << "]"
726                  << " using BitState.";
727      if (!prog_->SearchBitState(subtext1, text, anchor,
728                                 kind, submatch, ncap)) {
729        if (!skipped_test)
730          LOG(ERROR) << "SearchBitState inconsistency";
731        return false;
732      }
733    } else {
734      if (FLAGS_trace_re2)
735        LOG(INFO) << "Match " << trunc(pattern_)
736                  << " [" << CEscape(subtext) << "]"
737                  << " using NFA.";
738      if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
739        if (!skipped_test)
740          LOG(ERROR) << "SearchNFA inconsistency";
741        return false;
742      }
743    }
744  }
745
746  // Adjust overall match for required prefix that we stripped off.
747  if (prefixlen > 0 && nsubmatch > 0)
748    submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
749                              submatch[0].size() + prefixlen);
750
751  // Zero submatches that don't exist in the regexp.
752  for (int i = ncap; i < nsubmatch; i++)
753    submatch[i] = NULL;
754  return true;
755}
756
757// Internal matcher - like Match() but takes Args not StringPieces.
758bool RE2::DoMatch(const StringPiece& text,
759                  Anchor anchor,
760                  int* consumed,
761                  const Arg* const* args,
762                  int n) const {
763  if (!ok()) {
764    if (options_.log_errors())
765      LOG(ERROR) << "Invalid RE2: " << *error_;
766    return false;
767  }
768
769  // Count number of capture groups needed.
770  int nvec;
771  if (n == 0 && consumed == NULL)
772    nvec = 0;
773  else
774    nvec = n+1;
775
776  StringPiece* vec;
777  StringPiece stkvec[kVecSize];
778  StringPiece* heapvec = NULL;
779
780  if (nvec <= arraysize(stkvec)) {
781    vec = stkvec;
782  } else {
783    vec = new StringPiece[nvec];
784    heapvec = vec;
785  }
786
787  if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
788    delete[] heapvec;
789    return false;
790  }
791
792  if(consumed != NULL)
793    *consumed = vec[0].end() - text.begin();
794
795  if (n == 0 || args == NULL) {
796    // We are not interested in results
797    delete[] heapvec;
798    return true;
799  }
800
801  int ncap = NumberOfCapturingGroups();
802  if (ncap < n) {
803    // RE has fewer capturing groups than number of arg pointers passed in
804    VLOG(1) << "Asked for " << n << " but only have " << ncap;
805    delete[] heapvec;
806    return false;
807  }
808
809  // If we got here, we must have matched the whole pattern.
810  for (int i = 0; i < n; i++) {
811    const StringPiece& s = vec[i+1];
812    if (!args[i]->Parse(s.data(), s.size())) {
813      // TODO: Should we indicate what the error was?
814      VLOG(1) << "Parse error on #" << i << " " << s << " "
815	      << (void*)s.data() << "/" << s.size();
816      delete[] heapvec;
817      return false;
818    }
819  }
820
821  delete[] heapvec;
822  return true;
823}
824
825// Append the "rewrite" string, with backslash subsitutions from "vec",
826// to string "out".
827bool RE2::Rewrite(string *out, const StringPiece &rewrite,
828                 const StringPiece *vec, int veclen) const {
829  for (const char *s = rewrite.data(), *end = s + rewrite.size();
830       s < end; s++) {
831    int c = *s;
832    if (c == '\\') {
833      s++;
834      c = (s < end) ? *s : -1;
835      if (isdigit(c)) {
836        int n = (c - '0');
837        if (n >= veclen) {
838          LOG(ERROR) << "requested group " << n
839                     << " in regexp " << rewrite.data();
840          return false;
841        }
842        StringPiece snip = vec[n];
843        if (snip.size() > 0)
844          out->append(snip.data(), snip.size());
845      } else if (c == '\\') {
846        out->push_back('\\');
847      } else {
848        LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
849        return false;
850      }
851    } else {
852      out->push_back(c);
853    }
854  }
855  return true;
856}
857
858// Return the number of capturing subpatterns, or -1 if the
859// regexp wasn't valid on construction.
860int RE2::NumberOfCapturingGroups() const {
861  if (suffix_regexp_ == NULL)
862    return -1;
863  ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
864    " multiple threads end up doing the same work in parallel.");
865  if (num_captures_ == -1)
866    num_captures_ = suffix_regexp_->NumCaptures();
867  return num_captures_;
868}
869
870// Checks that the rewrite string is well-formed with respect to this
871// regular expression.
872bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
873  int max_token = -1;
874  for (const char *s = rewrite.data(), *end = s + rewrite.size();
875       s < end; s++) {
876    int c = *s;
877    if (c != '\\') {
878      continue;
879    }
880    if (++s == end) {
881      *error = "Rewrite schema error: '\\' not allowed at end.";
882      return false;
883    }
884    c = *s;
885    if (c == '\\') {
886      continue;
887    }
888    if (!isdigit(c)) {
889      *error = "Rewrite schema error: "
890               "'\\' must be followed by a digit or '\\'.";
891      return false;
892    }
893    int n = (c - '0');
894    if (max_token < n) {
895      max_token = n;
896    }
897  }
898
899  if (max_token > NumberOfCapturingGroups()) {
900    SStringPrintf(error, "Rewrite schema requests %d matches, "
901                  "but the regexp only has %d parenthesized subexpressions.",
902                  max_token, NumberOfCapturingGroups());
903    return false;
904  }
905  return true;
906}
907
908/***** Parsers for various types *****/
909
910bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
911  // We fail if somebody asked us to store into a non-NULL void* pointer
912  return (dest == NULL);
913}
914
915bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
916  if (dest == NULL) return true;
917  reinterpret_cast<string*>(dest)->assign(str, n);
918  return true;
919}
920
921bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
922  if (dest == NULL) return true;
923  reinterpret_cast<StringPiece*>(dest)->set(str, n);
924  return true;
925}
926
927bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
928  if (n != 1) return false;
929  if (dest == NULL) return true;
930  *(reinterpret_cast<char*>(dest)) = str[0];
931  return true;
932}
933
934bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
935  if (n != 1) return false;
936  if (dest == NULL) return true;
937  *(reinterpret_cast<unsigned char*>(dest)) = str[0];
938  return true;
939}
940
941// Largest number spec that we are willing to parse
942static const int kMaxNumberLength = 32;
943
944// REQUIRES "buf" must have length at least kMaxNumberLength+1
945// Copies "str" into "buf" and null-terminates.
946// Overwrites *np with the new length.
947static const char* TerminateNumber(char* buf, const char* str, int* np) {
948  int n = *np;
949  if (n <= 0) return "";
950  if (n > 0 && isspace(*str)) {
951    // We are less forgiving than the strtoxxx() routines and do not
952    // allow leading spaces.
953    return "";
954  }
955
956  // Although buf has a fixed maximum size, we can still handle
957  // arbitrarily large integers correctly by omitting leading zeros.
958  // (Numbers that are still too long will be out of range.)
959  // Before deciding whether str is too long,
960  // remove leading zeros with s/000+/00/.
961  // Leaving the leading two zeros in place means that
962  // we don't change 0000x123 (invalid) into 0x123 (valid).
963  // Skip over leading - before replacing.
964  bool neg = false;
965  if (n >= 1 && str[0] == '-') {
966    neg = true;
967    n--;
968    str++;
969  }
970
971  if (n >= 3 && str[0] == '0' && str[1] == '0') {
972    while (n >= 3 && str[2] == '0') {
973      n--;
974      str++;
975    }
976  }
977
978  if (neg) {  // make room in buf for -
979    n++;
980    str--;
981  }
982
983  if (n > kMaxNumberLength) return "";
984
985  memmove(buf, str, n);
986  if (neg) {
987    buf[0] = '-';
988  }
989  buf[n] = '\0';
990  *np = n;
991  return buf;
992}
993
994bool RE2::Arg::parse_long_radix(const char* str,
995                               int n,
996                               void* dest,
997                               int radix) {
998  if (n == 0) return false;
999  char buf[kMaxNumberLength+1];
1000  str = TerminateNumber(buf, str, &n);
1001  char* end;
1002  errno = 0;
1003  long r = strtol(str, &end, radix);
1004  if (end != str + n) return false;   // Leftover junk
1005  if (errno) return false;
1006  if (dest == NULL) return true;
1007  *(reinterpret_cast<long*>(dest)) = r;
1008  return true;
1009}
1010
1011bool RE2::Arg::parse_ulong_radix(const char* str,
1012                                int n,
1013                                void* dest,
1014                                int radix) {
1015  if (n == 0) return false;
1016  char buf[kMaxNumberLength+1];
1017  str = TerminateNumber(buf, str, &n);
1018  if (str[0] == '-') {
1019   // strtoul() will silently accept negative numbers and parse
1020   // them.  This module is more strict and treats them as errors.
1021   return false;
1022  }
1023
1024  char* end;
1025  errno = 0;
1026  unsigned long r = strtoul(str, &end, radix);
1027  if (end != str + n) return false;   // Leftover junk
1028  if (errno) return false;
1029  if (dest == NULL) return true;
1030  *(reinterpret_cast<unsigned long*>(dest)) = r;
1031  return true;
1032}
1033
1034bool RE2::Arg::parse_short_radix(const char* str,
1035                                int n,
1036                                void* dest,
1037                                int radix) {
1038  long r;
1039  if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1040  if ((short)r != r) return false;       // Out of range
1041  if (dest == NULL) return true;
1042  *(reinterpret_cast<short*>(dest)) = r;
1043  return true;
1044}
1045
1046bool RE2::Arg::parse_ushort_radix(const char* str,
1047                                 int n,
1048                                 void* dest,
1049                                 int radix) {
1050  unsigned long r;
1051  if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1052  if ((ushort)r != r) return false;                      // Out of range
1053  if (dest == NULL) return true;
1054  *(reinterpret_cast<unsigned short*>(dest)) = r;
1055  return true;
1056}
1057
1058bool RE2::Arg::parse_int_radix(const char* str,
1059                              int n,
1060                              void* dest,
1061                              int radix) {
1062  long r;
1063  if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1064  if ((int)r != r) return false;         // Out of range
1065  if (dest == NULL) return true;
1066  *(reinterpret_cast<int*>(dest)) = r;
1067  return true;
1068}
1069
1070bool RE2::Arg::parse_uint_radix(const char* str,
1071                               int n,
1072                               void* dest,
1073                               int radix) {
1074  unsigned long r;
1075  if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1076  if ((uint)r != r) return false;                       // Out of range
1077  if (dest == NULL) return true;
1078  *(reinterpret_cast<unsigned int*>(dest)) = r;
1079  return true;
1080}
1081
1082bool RE2::Arg::parse_longlong_radix(const char* str,
1083                                   int n,
1084                                   void* dest,
1085                                   int radix) {
1086  if (n == 0) return false;
1087  char buf[kMaxNumberLength+1];
1088  str = TerminateNumber(buf, str, &n);
1089  char* end;
1090  errno = 0;
1091  int64 r = strtoll(str, &end, radix);
1092  if (end != str + n) return false;   // Leftover junk
1093  if (errno) return false;
1094  if (dest == NULL) return true;
1095  *(reinterpret_cast<int64*>(dest)) = r;
1096  return true;
1097}
1098
1099bool RE2::Arg::parse_ulonglong_radix(const char* str,
1100                                    int n,
1101                                    void* dest,
1102                                    int radix) {
1103  if (n == 0) return false;
1104  char buf[kMaxNumberLength+1];
1105  str = TerminateNumber(buf, str, &n);
1106  if (str[0] == '-') {
1107    // strtoull() will silently accept negative numbers and parse
1108    // them.  This module is more strict and treats them as errors.
1109    return false;
1110  }
1111  char* end;
1112  errno = 0;
1113  uint64 r = strtoull(str, &end, radix);
1114  if (end != str + n) return false;   // Leftover junk
1115  if (errno) return false;
1116  if (dest == NULL) return true;
1117  *(reinterpret_cast<uint64*>(dest)) = r;
1118  return true;
1119}
1120
1121static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
1122  if (n == 0) return false;
1123  static const int kMaxLength = 200;
1124  char buf[kMaxLength];
1125  if (n >= kMaxLength) return false;
1126  memcpy(buf, str, n);
1127  buf[n] = '\0';
1128  errno = 0;
1129  char* end;
1130  double r;
1131  if (isfloat) {
1132    r = strtof(buf, &end);
1133  } else {
1134    r = strtod(buf, &end);
1135  }
1136  if (end != buf + n) return false;   // Leftover junk
1137  if (errno) return false;
1138  if (dest == NULL) return true;
1139  if (isfloat) {
1140    *(reinterpret_cast<float*>(dest)) = r;
1141  } else {
1142    *(reinterpret_cast<double*>(dest)) = r;
1143  }
1144  return true;
1145}
1146
1147bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
1148  return parse_double_float(str, n, false, dest);
1149}
1150
1151bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
1152  return parse_double_float(str, n, true, dest);
1153}
1154
1155
1156#define DEFINE_INTEGER_PARSERS(name)                                        \
1157  bool RE2::Arg::parse_##name(const char* str, int n, void* dest) {          \
1158    return parse_##name##_radix(str, n, dest, 10);                          \
1159  }                                                                         \
1160  bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
1161    return parse_##name##_radix(str, n, dest, 16);                          \
1162  }                                                                         \
1163  bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
1164    return parse_##name##_radix(str, n, dest, 8);                           \
1165  }                                                                         \
1166  bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
1167    return parse_##name##_radix(str, n, dest, 0);                           \
1168  }
1169
1170DEFINE_INTEGER_PARSERS(short);
1171DEFINE_INTEGER_PARSERS(ushort);
1172DEFINE_INTEGER_PARSERS(int);
1173DEFINE_INTEGER_PARSERS(uint);
1174DEFINE_INTEGER_PARSERS(long);
1175DEFINE_INTEGER_PARSERS(ulong);
1176DEFINE_INTEGER_PARSERS(longlong);
1177DEFINE_INTEGER_PARSERS(ulonglong);
1178
1179#undef DEFINE_INTEGER_PARSERS
1180
1181}  // namespace re2
1182