1// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Regular expression representation.
6// Tested by parse_test.cc
7
8#include "util/util.h"
9#include "re2/regexp.h"
10#include "re2/stringpiece.h"
11#include "re2/walker-inl.h"
12
13namespace re2 {
14
15// Constructor.  Allocates vectors as appropriate for operator.
16Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
17  : op_(op),
18    simple_(false),
19    parse_flags_(static_cast<uint16>(parse_flags)),
20    ref_(1),
21    nsub_(0),
22    down_(NULL) {
23  subone_ = NULL;
24  memset(the_union_, 0, sizeof the_union_);
25}
26
27// Destructor.  Assumes already cleaned up children.
28// Private: use Decref() instead of delete to destroy Regexps.
29// Can't call Decref on the sub-Regexps here because
30// that could cause arbitrarily deep recursion, so
31// required Decref() to have handled them for us.
32Regexp::~Regexp() {
33  if (nsub_ > 0)
34    LOG(DFATAL) << "Regexp not destroyed.";
35
36  switch (op_) {
37    default:
38      break;
39    case kRegexpCapture:
40      delete name_;
41      break;
42    case kRegexpLiteralString:
43      delete[] runes_;
44      break;
45    case kRegexpCharClass:
46      cc_->Delete();
47      delete ccb_;
48      break;
49  }
50}
51
52// If it's possible to destroy this regexp without recurring,
53// do so and return true.  Else return false.
54bool Regexp::QuickDestroy() {
55  if (nsub_ == 0) {
56    delete this;
57    return true;
58  }
59  return false;
60}
61
62static map<Regexp*, int> ref_map;
63static Mutex ref_mutex;
64
65int Regexp::Ref() {
66  if (ref_ < kMaxRef)
67    return ref_;
68
69  MutexLock l(&ref_mutex);
70  return ref_map[this];
71}
72
73// Increments reference count, returns object as convenience.
74Regexp* Regexp::Incref() {
75  if (ref_ >= kMaxRef-1) {
76    // Store ref count in overflow map.
77    MutexLock l(&ref_mutex);
78    if (ref_ == kMaxRef) {  // already overflowed
79      ref_map[this]++;
80      return this;
81    }
82    // overflowing now
83    ref_map[this] = kMaxRef;
84    ref_ = kMaxRef;
85    return this;
86  }
87
88  ref_++;
89  return this;
90}
91
92// Decrements reference count and deletes this object if count reaches 0.
93void Regexp::Decref() {
94  if (ref_ == kMaxRef) {
95    // Ref count is stored in overflow map.
96    MutexLock l(&ref_mutex);
97    int r = ref_map[this] - 1;
98    if (r < kMaxRef) {
99      ref_ = r;
100      ref_map.erase(this);
101    } else {
102      ref_map[this] = r;
103    }
104    return;
105  }
106  ref_--;
107  if (ref_ == 0)
108    Destroy();
109}
110
111// Deletes this object; ref count has count reached 0.
112void Regexp::Destroy() {
113  if (QuickDestroy())
114    return;
115
116  // Handle recursive Destroy with explicit stack
117  // to avoid arbitrarily deep recursion on process stack [sigh].
118  down_ = NULL;
119  Regexp* stack = this;
120  while (stack != NULL) {
121    Regexp* re = stack;
122    stack = re->down_;
123    if (re->ref_ != 0)
124      LOG(DFATAL) << "Bad reference count " << re->ref_;
125    if (re->nsub_ > 0) {
126      Regexp** subs = re->sub();
127      for (int i = 0; i < re->nsub_; i++) {
128        Regexp* sub = subs[i];
129        if (sub == NULL)
130          continue;
131        if (sub->ref_ == kMaxRef)
132          sub->Decref();
133        else
134          --sub->ref_;
135        if (sub->ref_ == 0 && !sub->QuickDestroy()) {
136          sub->down_ = stack;
137          stack = sub;
138        }
139      }
140      if (re->nsub_ > 1)
141        delete[] subs;
142      re->nsub_ = 0;
143    }
144    delete re;
145  }
146}
147
148void Regexp::AddRuneToString(Rune r) {
149  DCHECK(op_ == kRegexpLiteralString);
150  if (nrunes_ == 0) {
151    // start with 8
152    runes_ = new Rune[8];
153  } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
154    // double on powers of two
155    Rune *old = runes_;
156    runes_ = new Rune[nrunes_ * 2];
157    for (int i = 0; i < nrunes_; i++)
158      runes_[i] = old[i];
159    delete[] old;
160  }
161
162  runes_[nrunes_++] = r;
163}
164
165Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
166  Regexp* re = new Regexp(kRegexpHaveMatch, flags);
167  re->match_id_ = match_id;
168  return re;
169}
170
171Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
172  if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
173    return sub;
174  Regexp* re = new Regexp(kRegexpPlus, flags);
175  re->AllocSub(1);
176  re->sub()[0] = sub;
177  return re;
178}
179
180Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
181  if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
182    return sub;
183  Regexp* re = new Regexp(kRegexpStar, flags);
184  re->AllocSub(1);
185  re->sub()[0] = sub;
186  return re;
187}
188
189Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
190  if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
191    return sub;
192  Regexp* re = new Regexp(kRegexpQuest, flags);
193  re->AllocSub(1);
194  re->sub()[0] = sub;
195  return re;
196}
197
198Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
199                                  ParseFlags flags, bool can_factor) {
200  if (nsub == 1)
201    return sub[0];
202
203  Regexp** subcopy = NULL;
204  if (op == kRegexpAlternate && can_factor) {
205    // Going to edit sub; make a copy so we don't step on caller.
206    subcopy = new Regexp*[nsub];
207    memmove(subcopy, sub, nsub * sizeof sub[0]);
208    sub = subcopy;
209    nsub = FactorAlternation(sub, nsub, flags);
210    if (nsub == 1) {
211      Regexp* re = sub[0];
212      delete[] subcopy;
213      return re;
214    }
215  }
216
217  if (nsub > kMaxNsub) {
218    // Too many subexpressions to fit in a single Regexp.
219    // Make a two-level tree.  Two levels gets us to 65535^2.
220    int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
221    Regexp* re = new Regexp(op, flags);
222    re->AllocSub(nbigsub);
223    Regexp** subs = re->sub();
224    for (int i = 0; i < nbigsub - 1; i++)
225      subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
226    subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
227                                          nsub - (nbigsub-1)*kMaxNsub, flags,
228                                          false);
229    delete[] subcopy;
230    return re;
231  }
232
233  Regexp* re = new Regexp(op, flags);
234  re->AllocSub(nsub);
235  Regexp** subs = re->sub();
236  for (int i = 0; i < nsub; i++)
237    subs[i] = sub[i];
238
239  delete[] subcopy;
240  return re;
241}
242
243Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
244  return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
245}
246
247Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
248  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
249}
250
251Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
252  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
253}
254
255Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
256  Regexp* re = new Regexp(kRegexpCapture, flags);
257  re->AllocSub(1);
258  re->sub()[0] = sub;
259  re->cap_ = cap;
260  return re;
261}
262
263Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
264  Regexp* re = new Regexp(kRegexpRepeat, flags);
265  re->AllocSub(1);
266  re->sub()[0] = sub;
267  re->min_ = min;
268  re->max_ = max;
269  return re;
270}
271
272Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
273  Regexp* re = new Regexp(kRegexpLiteral, flags);
274  re->rune_ = rune;
275  return re;
276}
277
278Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
279  if (nrunes <= 0)
280    return new Regexp(kRegexpEmptyMatch, flags);
281  if (nrunes == 1)
282    return NewLiteral(runes[0], flags);
283  Regexp* re = new Regexp(kRegexpLiteralString, flags);
284  for (int i = 0; i < nrunes; i++)
285    re->AddRuneToString(runes[i]);
286  return re;
287}
288
289Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
290  Regexp* re = new Regexp(kRegexpCharClass, flags);
291  re->cc_ = cc;
292  return re;
293}
294
295// Swaps this and that in place.
296void Regexp::Swap(Regexp* that) {
297  // Can use memmove because Regexp is just a struct (no vtable).
298  char tmp[sizeof *this];
299  memmove(tmp, this, sizeof tmp);
300  memmove(this, that, sizeof tmp);
301  memmove(that, tmp, sizeof tmp);
302}
303
304// Tests equality of all top-level structure but not subregexps.
305static bool TopEqual(Regexp* a, Regexp* b) {
306  if (a->op() != b->op())
307    return false;
308
309  switch (a->op()) {
310    case kRegexpNoMatch:
311    case kRegexpEmptyMatch:
312    case kRegexpAnyChar:
313    case kRegexpAnyByte:
314    case kRegexpBeginLine:
315    case kRegexpEndLine:
316    case kRegexpWordBoundary:
317    case kRegexpNoWordBoundary:
318    case kRegexpBeginText:
319      return true;
320
321    case kRegexpEndText:
322      // The parse flags remember whether it's \z or (?-m:$),
323      // which matters when testing against PCRE.
324      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
325
326    case kRegexpLiteral:
327      return a->rune() == b->rune() &&
328             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
329
330    case kRegexpLiteralString:
331      return a->nrunes() == b->nrunes() &&
332             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
333             memcmp(a->runes(), b->runes(),
334                    a->nrunes() * sizeof a->runes()[0]) == 0;
335
336    case kRegexpAlternate:
337    case kRegexpConcat:
338      return a->nsub() == b->nsub();
339
340    case kRegexpStar:
341    case kRegexpPlus:
342    case kRegexpQuest:
343      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
344
345    case kRegexpRepeat:
346      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
347             a->min() == b->min() &&
348             a->max() == b->max();
349
350    case kRegexpCapture:
351      return a->cap() == b->cap() && a->name() == b->name();
352
353    case kRegexpHaveMatch:
354      return a->match_id() == b->match_id();
355
356    case kRegexpCharClass: {
357      CharClass* acc = a->cc();
358      CharClass* bcc = b->cc();
359      return acc->size() == bcc->size() &&
360             acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
361             memcmp(acc->begin(), bcc->begin(),
362                    (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
363    }
364  }
365
366  LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
367  return 0;
368}
369
370bool Regexp::Equal(Regexp* a, Regexp* b) {
371  if (a == NULL || b == NULL)
372    return a == b;
373
374  if (!TopEqual(a, b))
375    return false;
376
377  // Fast path:
378  // return without allocating vector if there are no subregexps.
379  switch (a->op()) {
380    case kRegexpAlternate:
381    case kRegexpConcat:
382    case kRegexpStar:
383    case kRegexpPlus:
384    case kRegexpQuest:
385    case kRegexpRepeat:
386    case kRegexpCapture:
387      break;
388
389    default:
390      return true;
391  }
392
393  // Committed to doing real work.
394  // The stack (vector) has pairs of regexps waiting to
395  // be compared.  The regexps are only equal if
396  // all the pairs end up being equal.
397  vector<Regexp*> stk;
398
399  for (;;) {
400    // Invariant: TopEqual(a, b) == true.
401    Regexp* a2;
402    Regexp* b2;
403    switch (a->op()) {
404      default:
405        break;
406      case kRegexpAlternate:
407      case kRegexpConcat:
408        for (int i = 0; i < a->nsub(); i++) {
409          a2 = a->sub()[i];
410          b2 = b->sub()[i];
411          if (!TopEqual(a2, b2))
412            return false;
413          stk.push_back(a2);
414          stk.push_back(b2);
415        }
416        break;
417
418      case kRegexpStar:
419      case kRegexpPlus:
420      case kRegexpQuest:
421      case kRegexpRepeat:
422      case kRegexpCapture:
423        a2 = a->sub()[0];
424        b2 = b->sub()[0];
425        if (!TopEqual(a2, b2))
426          return false;
427        // Really:
428        //   stk.push_back(a2);
429        //   stk.push_back(b2);
430        //   break;
431        // but faster to assign directly and loop.
432        a = a2;
433        b = b2;
434        continue;
435    }
436
437    int n = stk.size();
438    if (n == 0)
439      break;
440
441    a = stk[n-2];
442    b = stk[n-1];
443    stk.resize(n-2);
444  }
445
446  return true;
447}
448
449// Keep in sync with enum RegexpStatusCode in regexp.h
450static const string kErrorStrings[] = {
451  "no error",
452  "unexpected error",
453  "invalid escape sequence",
454  "invalid character class",
455  "invalid character class range",
456  "missing ]",
457  "missing )",
458  "trailing \\",
459  "no argument for repetition operator",
460  "invalid repetition size",
461  "bad repetition operator",
462  "invalid perl operator",
463  "invalid UTF-8",
464  "invalid named capture group",
465};
466
467const string& RegexpStatus::CodeText(enum RegexpStatusCode code) {
468  if (code < 0 || code >= arraysize(kErrorStrings))
469    code = kRegexpInternalError;
470  return kErrorStrings[code];
471}
472
473string RegexpStatus::Text() const {
474  if (error_arg_.empty())
475    return CodeText(code_);
476  string s;
477  s.append(CodeText(code_));
478  s.append(": ");
479  s.append(error_arg_.data(), error_arg_.size());
480  return s;
481}
482
483void RegexpStatus::Copy(const RegexpStatus& status) {
484  code_ = status.code_;
485  error_arg_ = status.error_arg_;
486}
487
488typedef int Ignored;  // Walker<void> doesn't exist
489
490// Walker subclass to count capturing parens in regexp.
491class NumCapturesWalker : public Regexp::Walker<Ignored> {
492 public:
493  NumCapturesWalker() : ncapture_(0) {}
494  int ncapture() { return ncapture_; }
495
496  virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
497    if (re->op() == kRegexpCapture)
498      ncapture_++;
499    return ignored;
500  }
501  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
502    // Should never be called: we use Walk not WalkExponential.
503    LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
504    return ignored;
505  }
506
507 private:
508  int ncapture_;
509  DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
510};
511
512int Regexp::NumCaptures() {
513  NumCapturesWalker w;
514  w.Walk(this, 0);
515  return w.ncapture();
516}
517
518// Walker class to build map of named capture groups and their indices.
519class NamedCapturesWalker : public Regexp::Walker<Ignored> {
520 public:
521  NamedCapturesWalker() : map_(NULL) {}
522  ~NamedCapturesWalker() { delete map_; }
523
524  map<string, int>* TakeMap() {
525    map<string, int>* m = map_;
526    map_ = NULL;
527    return m;
528  }
529
530  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
531    if (re->op() == kRegexpCapture && re->name() != NULL) {
532      // Allocate map once we find a name.
533      if (map_ == NULL)
534        map_ = new map<string, int>;
535
536      // Record first occurrence of each name.
537      // (The rule is that if you have the same name
538      // multiple times, only the leftmost one counts.)
539      if (map_->find(*re->name()) == map_->end())
540        (*map_)[*re->name()] = re->cap();
541    }
542    return ignored;
543  }
544
545  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
546    // Should never be called: we use Walk not WalkExponential.
547    LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
548    return ignored;
549  }
550
551 private:
552  map<string, int>* map_;
553  DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
554};
555
556map<string, int>* Regexp::NamedCaptures() {
557  NamedCapturesWalker w;
558  w.Walk(this, 0);
559  return w.TakeMap();
560}
561
562// Walker class to build map from capture group indices to their names.
563class CaptureNamesWalker : public Regexp::Walker<Ignored> {
564 public:
565  CaptureNamesWalker() : map_(NULL) {}
566  ~CaptureNamesWalker() { delete map_; }
567
568  map<int, string>* TakeMap() {
569    map<int, string>* m = map_;
570    map_ = NULL;
571    return m;
572  }
573
574  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
575    if (re->op() == kRegexpCapture && re->name() != NULL) {
576      // Allocate map once we find a name.
577      if (map_ == NULL)
578        map_ = new map<int, string>;
579
580      (*map_)[re->cap()] = *re->name();
581    }
582    return ignored;
583  }
584
585  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
586    // Should never be called: we use Walk not WalkExponential.
587    LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
588    return ignored;
589  }
590
591 private:
592  map<int, string>* map_;
593  DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
594};
595
596map<int, string>* Regexp::CaptureNames() {
597  CaptureNamesWalker w;
598  w.Walk(this, 0);
599  return w.TakeMap();
600}
601
602// Determines whether regexp matches must be anchored
603// with a fixed string prefix.  If so, returns the prefix and
604// the regexp that remains after the prefix.  The prefix might
605// be ASCII case-insensitive.
606bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
607  // No need for a walker: the regexp must be of the form
608  // 1. some number of ^ anchors
609  // 2. a literal char or string
610  // 3. the rest
611  prefix->clear();
612  *foldcase = false;
613  *suffix = NULL;
614  if (op_ != kRegexpConcat)
615    return false;
616
617  // Some number of anchors, then a literal or concatenation.
618  int i = 0;
619  Regexp** sub = this->sub();
620  while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
621    i++;
622  if (i == 0 || i >= nsub_)
623    return false;
624
625  Regexp* re = sub[i];
626  switch (re->op_) {
627    default:
628      return false;
629
630    case kRegexpLiteralString:
631      // Convert to string in proper encoding.
632      if (re->parse_flags() & Latin1) {
633        prefix->resize(re->nrunes_);
634        for (int j = 0; j < re->nrunes_; j++)
635          (*prefix)[j] = re->runes_[j];
636      } else {
637        // Convert to UTF-8 in place.
638        // Assume worst-case space and then trim.
639        prefix->resize(re->nrunes_ * UTFmax);
640        char *p = &(*prefix)[0];
641        for (int j = 0; j < re->nrunes_; j++) {
642          Rune r = re->runes_[j];
643          if (r < Runeself)
644            *p++ = r;
645          else
646            p += runetochar(p, &r);
647        }
648        prefix->resize(p - &(*prefix)[0]);
649      }
650      break;
651
652    case kRegexpLiteral:
653      if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
654        prefix->append(1, re->rune_);
655      } else {
656        char buf[UTFmax];
657        prefix->append(buf, runetochar(buf, &re->rune_));
658      }
659      break;
660  }
661  *foldcase = (sub[i]->parse_flags() & FoldCase);
662  i++;
663
664  // The rest.
665  if (i < nsub_) {
666    for (int j = i; j < nsub_; j++)
667      sub[j]->Incref();
668    re = Concat(sub + i, nsub_ - i, parse_flags());
669  } else {
670    re = new Regexp(kRegexpEmptyMatch, parse_flags());
671  }
672  *suffix = re;
673  return true;
674}
675
676// Character class builder is a balanced binary tree (STL set)
677// containing non-overlapping, non-abutting RuneRanges.
678// The less-than operator used in the tree treats two
679// ranges as equal if they overlap at all, so that
680// lookups for a particular Rune are possible.
681
682CharClassBuilder::CharClassBuilder() {
683  nrunes_ = 0;
684  upper_ = 0;
685  lower_ = 0;
686}
687
688// Add lo-hi to the class; return whether class got bigger.
689bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
690  if (hi < lo)
691    return false;
692
693  if (lo <= 'z' && hi >= 'A') {
694    // Overlaps some alpha, maybe not all.
695    // Update bitmaps telling which ASCII letters are in the set.
696    Rune lo1 = max<Rune>(lo, 'A');
697    Rune hi1 = min<Rune>(hi, 'Z');
698    if (lo1 <= hi1)
699      upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
700
701    lo1 = max<Rune>(lo, 'a');
702    hi1 = min<Rune>(hi, 'z');
703    if (lo1 <= hi1)
704      lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
705  }
706
707  {  // Check whether lo, hi is already in the class.
708    iterator it = ranges_.find(RuneRange(lo, lo));
709    if (it != end() && it->lo <= lo && hi <= it->hi)
710      return false;
711  }
712
713  // Look for a range abutting lo on the left.
714  // If it exists, take it out and increase our range.
715  if (lo > 0) {
716    iterator it = ranges_.find(RuneRange(lo-1, lo-1));
717    if (it != end()) {
718      lo = it->lo;
719      if (it->hi > hi)
720        hi = it->hi;
721      nrunes_ -= it->hi - it->lo + 1;
722      ranges_.erase(it);
723    }
724  }
725
726  // Look for a range abutting hi on the right.
727  // If it exists, take it out and increase our range.
728  if (hi < Runemax) {
729    iterator it = ranges_.find(RuneRange(hi+1, hi+1));
730    if (it != end()) {
731      hi = it->hi;
732      nrunes_ -= it->hi - it->lo + 1;
733      ranges_.erase(it);
734    }
735  }
736
737  // Look for ranges between lo and hi.  Take them out.
738  // This is only safe because the set has no overlapping ranges.
739  // We've already removed any ranges abutting lo and hi, so
740  // any that overlap [lo, hi] must be contained within it.
741  for (;;) {
742    iterator it = ranges_.find(RuneRange(lo, hi));
743    if (it == end())
744      break;
745    nrunes_ -= it->hi - it->lo + 1;
746    ranges_.erase(it);
747  }
748
749  // Finally, add [lo, hi].
750  nrunes_ += hi - lo + 1;
751  ranges_.insert(RuneRange(lo, hi));
752  return true;
753}
754
755void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
756  for (iterator it = cc->begin(); it != cc->end(); ++it)
757    AddRange(it->lo, it->hi);
758}
759
760bool CharClassBuilder::Contains(Rune r) {
761  return ranges_.find(RuneRange(r, r)) != end();
762}
763
764// Does the character class behave the same on A-Z as on a-z?
765bool CharClassBuilder::FoldsASCII() {
766  return ((upper_ ^ lower_) & AlphaMask) == 0;
767}
768
769CharClassBuilder* CharClassBuilder::Copy() {
770  CharClassBuilder* cc = new CharClassBuilder;
771  for (iterator it = begin(); it != end(); ++it)
772    cc->ranges_.insert(RuneRange(it->lo, it->hi));
773  cc->upper_ = upper_;
774  cc->lower_ = lower_;
775  cc->nrunes_ = nrunes_;
776  return cc;
777}
778
779
780
781void CharClassBuilder::RemoveAbove(Rune r) {
782  if (r >= Runemax)
783    return;
784
785  if (r < 'z') {
786    if (r < 'a')
787      lower_ = 0;
788    else
789      lower_ &= AlphaMask >> ('z' - r);
790  }
791
792  if (r < 'Z') {
793    if (r < 'A')
794      upper_ = 0;
795    else
796      upper_ &= AlphaMask >> ('Z' - r);
797  }
798
799  for (;;) {
800
801    iterator it = ranges_.find(RuneRange(r + 1, Runemax));
802    if (it == end())
803      break;
804    RuneRange rr = *it;
805    ranges_.erase(it);
806    nrunes_ -= rr.hi - rr.lo + 1;
807    if (rr.lo <= r) {
808      rr.hi = r;
809      ranges_.insert(rr);
810      nrunes_ += rr.hi - rr.lo + 1;
811    }
812  }
813}
814
815void CharClassBuilder::Negate() {
816  // Build up negation and then copy in.
817  // Could edit ranges in place, but C++ won't let me.
818  vector<RuneRange> v;
819  v.reserve(ranges_.size() + 1);
820
821  // In negation, first range begins at 0, unless
822  // the current class begins at 0.
823  iterator it = begin();
824  if (it == end()) {
825    v.push_back(RuneRange(0, Runemax));
826  } else {
827    int nextlo = 0;
828    if (it->lo == 0) {
829      nextlo = it->hi + 1;
830      ++it;
831    }
832    for (; it != end(); ++it) {
833      v.push_back(RuneRange(nextlo, it->lo - 1));
834      nextlo = it->hi + 1;
835    }
836    if (nextlo <= Runemax)
837      v.push_back(RuneRange(nextlo, Runemax));
838  }
839
840  ranges_.clear();
841  for (int i = 0; i < v.size(); i++)
842    ranges_.insert(v[i]);
843
844  upper_ = AlphaMask & ~upper_;
845  lower_ = AlphaMask & ~lower_;
846  nrunes_ = Runemax+1 - nrunes_;
847}
848
849// Character class is a sorted list of ranges.
850// The ranges are allocated in the same block as the header,
851// necessitating a special allocator and Delete method.
852
853CharClass* CharClass::New(int maxranges) {
854  CharClass* cc;
855  uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
856  cc = reinterpret_cast<CharClass*>(data);
857  cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
858  cc->nranges_ = 0;
859  cc->folds_ascii_ = false;
860  cc->nrunes_ = 0;
861  return cc;
862}
863
864void CharClass::Delete() {
865  if (this == NULL)
866    return;
867  uint8 *data = reinterpret_cast<uint8*>(this);
868  delete[] data;
869}
870
871CharClass* CharClass::Negate() {
872  CharClass* cc = CharClass::New(nranges_+1);
873  cc->folds_ascii_ = folds_ascii_;
874  cc->nrunes_ = Runemax + 1 - nrunes_;
875  int n = 0;
876  int nextlo = 0;
877  for (CharClass::iterator it = begin(); it != end(); ++it) {
878    if (it->lo == nextlo) {
879      nextlo = it->hi + 1;
880    } else {
881      cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
882      nextlo = it->hi + 1;
883    }
884  }
885  if (nextlo <= Runemax)
886    cc->ranges_[n++] = RuneRange(nextlo, Runemax);
887  cc->nranges_ = n;
888  return cc;
889}
890
891bool CharClass::Contains(Rune r) {
892  RuneRange* rr = ranges_;
893  int n = nranges_;
894  while (n > 0) {
895    int m = n/2;
896    if (rr[m].hi < r) {
897      rr += m+1;
898      n -= m+1;
899    } else if (r < rr[m].lo) {
900      n = m;
901    } else {  // rr[m].lo <= r && r <= rr[m].hi
902      return true;
903    }
904  }
905  return false;
906}
907
908CharClass* CharClassBuilder::GetCharClass() {
909  CharClass* cc = CharClass::New(ranges_.size());
910  int n = 0;
911  for (iterator it = begin(); it != end(); ++it)
912    cc->ranges_[n++] = *it;
913  cc->nranges_ = n;
914  DCHECK_LE(n, ranges_.size());
915  cc->nrunes_ = nrunes_;
916  cc->folds_ascii_ = FoldsASCII();
917  return cc;
918}
919
920}  // namespace re2
921