regexp.cc revision f8ee788a64d60abd8f2d742a5fdedde054ecd910
1// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Regular expression representation.
6// Tested by parse_test.cc
7
8#include "util/util.h"
9#include "re2/regexp.h"
10#include "re2/stringpiece.h"
11#include "re2/walker-inl.h"
12
13namespace re2 {
14
15// Constructor.  Allocates vectors as appropriate for operator.
16Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
17  : op_(op),
18    simple_(false),
19    parse_flags_(static_cast<uint16>(parse_flags)),
20    ref_(1),
21    nsub_(0),
22    down_(NULL) {
23  subone_ = NULL;
24  memset(the_union_, 0, sizeof the_union_);
25}
26
27// Destructor.  Assumes already cleaned up children.
28// Private: use Decref() instead of delete to destroy Regexps.
29// Can't call Decref on the sub-Regexps here because
30// that could cause arbitrarily deep recursion, so
31// required Decref() to have handled them for us.
32Regexp::~Regexp() {
33  if (nsub_ > 0)
34    LOG(DFATAL) << "Regexp not destroyed.";
35
36  switch (op_) {
37    default:
38      break;
39    case kRegexpCapture:
40      delete name_;
41      break;
42    case kRegexpLiteralString:
43      delete[] runes_;
44      break;
45    case kRegexpCharClass:
46      cc_->Delete();
47      delete ccb_;
48      break;
49  }
50}
51
52// If it's possible to destroy this regexp without recurring,
53// do so and return true.  Else return false.
54bool Regexp::QuickDestroy() {
55  if (nsub_ == 0) {
56    delete this;
57    return true;
58  }
59  return false;
60}
61
62static map<Regexp*, int> *ref_map;
63GLOBAL_MUTEX(ref_mutex);
64
65int Regexp::Ref() {
66  if (ref_ < kMaxRef)
67    return ref_;
68
69  GLOBAL_MUTEX_LOCK(ref_mutex);
70  int r = 0;
71  if (ref_map != NULL) {
72    r = (*ref_map)[this];
73  }
74  GLOBAL_MUTEX_UNLOCK(ref_mutex);
75  return r;
76}
77
78// Increments reference count, returns object as convenience.
79Regexp* Regexp::Incref() {
80  if (ref_ >= kMaxRef-1) {
81    // Store ref count in overflow map.
82    GLOBAL_MUTEX_LOCK(ref_mutex);
83    if (ref_map == NULL) {
84      ref_map = new map<Regexp*, int>;
85    }
86    if (ref_ == kMaxRef) {
87      // already overflowed
88      (*ref_map)[this]++;
89    } else {
90      // overflowing now
91      (*ref_map)[this] = kMaxRef;
92      ref_ = kMaxRef;
93    }
94    GLOBAL_MUTEX_UNLOCK(ref_mutex);
95    return this;
96  }
97
98  ref_++;
99  return this;
100}
101
102// Decrements reference count and deletes this object if count reaches 0.
103void Regexp::Decref() {
104  if (ref_ == kMaxRef) {
105    // Ref count is stored in overflow map.
106    GLOBAL_MUTEX_LOCK(ref_mutex);
107    int r = (*ref_map)[this] - 1;
108    if (r < kMaxRef) {
109      ref_ = r;
110      ref_map->erase(this);
111    } else {
112      (*ref_map)[this] = r;
113    }
114    GLOBAL_MUTEX_UNLOCK(ref_mutex);
115    return;
116  }
117  ref_--;
118  if (ref_ == 0)
119    Destroy();
120}
121
122// Deletes this object; ref count has count reached 0.
123void Regexp::Destroy() {
124  if (QuickDestroy())
125    return;
126
127  // Handle recursive Destroy with explicit stack
128  // to avoid arbitrarily deep recursion on process stack [sigh].
129  down_ = NULL;
130  Regexp* stack = this;
131  while (stack != NULL) {
132    Regexp* re = stack;
133    stack = re->down_;
134    if (re->ref_ != 0)
135      LOG(DFATAL) << "Bad reference count " << re->ref_;
136    if (re->nsub_ > 0) {
137      Regexp** subs = re->sub();
138      for (int i = 0; i < re->nsub_; i++) {
139        Regexp* sub = subs[i];
140        if (sub == NULL)
141          continue;
142        if (sub->ref_ == kMaxRef)
143          sub->Decref();
144        else
145          --sub->ref_;
146        if (sub->ref_ == 0 && !sub->QuickDestroy()) {
147          sub->down_ = stack;
148          stack = sub;
149        }
150      }
151      if (re->nsub_ > 1)
152        delete[] subs;
153      re->nsub_ = 0;
154    }
155    delete re;
156  }
157}
158
159void Regexp::AddRuneToString(Rune r) {
160  DCHECK(op_ == kRegexpLiteralString);
161  if (nrunes_ == 0) {
162    // start with 8
163    runes_ = new Rune[8];
164  } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
165    // double on powers of two
166    Rune *old = runes_;
167    runes_ = new Rune[nrunes_ * 2];
168    for (int i = 0; i < nrunes_; i++)
169      runes_[i] = old[i];
170    delete[] old;
171  }
172
173  runes_[nrunes_++] = r;
174}
175
176Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
177  Regexp* re = new Regexp(kRegexpHaveMatch, flags);
178  re->match_id_ = match_id;
179  return re;
180}
181
182Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
183  if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
184    return sub;
185  Regexp* re = new Regexp(kRegexpPlus, flags);
186  re->AllocSub(1);
187  re->sub()[0] = sub;
188  return re;
189}
190
191Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
192  if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
193    return sub;
194  Regexp* re = new Regexp(kRegexpStar, flags);
195  re->AllocSub(1);
196  re->sub()[0] = sub;
197  return re;
198}
199
200Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
201  if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
202    return sub;
203  Regexp* re = new Regexp(kRegexpQuest, flags);
204  re->AllocSub(1);
205  re->sub()[0] = sub;
206  return re;
207}
208
209Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
210                                  ParseFlags flags, bool can_factor) {
211  if (nsub == 1)
212    return sub[0];
213
214  Regexp** subcopy = NULL;
215  if (op == kRegexpAlternate && can_factor) {
216    // Going to edit sub; make a copy so we don't step on caller.
217    subcopy = new Regexp*[nsub];
218    memmove(subcopy, sub, nsub * sizeof sub[0]);
219    sub = subcopy;
220    nsub = FactorAlternation(sub, nsub, flags);
221    if (nsub == 1) {
222      Regexp* re = sub[0];
223      delete[] subcopy;
224      return re;
225    }
226  }
227
228  if (nsub > kMaxNsub) {
229    // Too many subexpressions to fit in a single Regexp.
230    // Make a two-level tree.  Two levels gets us to 65535^2.
231    int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
232    Regexp* re = new Regexp(op, flags);
233    re->AllocSub(nbigsub);
234    Regexp** subs = re->sub();
235    for (int i = 0; i < nbigsub - 1; i++)
236      subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
237    subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
238                                          nsub - (nbigsub-1)*kMaxNsub, flags,
239                                          false);
240    delete[] subcopy;
241    return re;
242  }
243
244  Regexp* re = new Regexp(op, flags);
245  re->AllocSub(nsub);
246  Regexp** subs = re->sub();
247  for (int i = 0; i < nsub; i++)
248    subs[i] = sub[i];
249
250  delete[] subcopy;
251  return re;
252}
253
254Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
255  return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
256}
257
258Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
259  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
260}
261
262Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
263  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
264}
265
266Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
267  Regexp* re = new Regexp(kRegexpCapture, flags);
268  re->AllocSub(1);
269  re->sub()[0] = sub;
270  re->cap_ = cap;
271  return re;
272}
273
274Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
275  Regexp* re = new Regexp(kRegexpRepeat, flags);
276  re->AllocSub(1);
277  re->sub()[0] = sub;
278  re->min_ = min;
279  re->max_ = max;
280  return re;
281}
282
283Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
284  Regexp* re = new Regexp(kRegexpLiteral, flags);
285  re->rune_ = rune;
286  return re;
287}
288
289Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
290  if (nrunes <= 0)
291    return new Regexp(kRegexpEmptyMatch, flags);
292  if (nrunes == 1)
293    return NewLiteral(runes[0], flags);
294  Regexp* re = new Regexp(kRegexpLiteralString, flags);
295  for (int i = 0; i < nrunes; i++)
296    re->AddRuneToString(runes[i]);
297  return re;
298}
299
300Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
301  Regexp* re = new Regexp(kRegexpCharClass, flags);
302  re->cc_ = cc;
303  return re;
304}
305
306// Swaps this and that in place.
307void Regexp::Swap(Regexp* that) {
308  // Can use memmove because Regexp is just a struct (no vtable).
309  char tmp[sizeof *this];
310  memmove(tmp, this, sizeof tmp);
311  memmove(this, that, sizeof tmp);
312  memmove(that, tmp, sizeof tmp);
313}
314
315// Tests equality of all top-level structure but not subregexps.
316static bool TopEqual(Regexp* a, Regexp* b) {
317  if (a->op() != b->op())
318    return false;
319
320  switch (a->op()) {
321    case kRegexpNoMatch:
322    case kRegexpEmptyMatch:
323    case kRegexpAnyChar:
324    case kRegexpAnyByte:
325    case kRegexpBeginLine:
326    case kRegexpEndLine:
327    case kRegexpWordBoundary:
328    case kRegexpNoWordBoundary:
329    case kRegexpBeginText:
330      return true;
331
332    case kRegexpEndText:
333      // The parse flags remember whether it's \z or (?-m:$),
334      // which matters when testing against PCRE.
335      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
336
337    case kRegexpLiteral:
338      return a->rune() == b->rune() &&
339             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
340
341    case kRegexpLiteralString:
342      return a->nrunes() == b->nrunes() &&
343             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
344             memcmp(a->runes(), b->runes(),
345                    a->nrunes() * sizeof a->runes()[0]) == 0;
346
347    case kRegexpAlternate:
348    case kRegexpConcat:
349      return a->nsub() == b->nsub();
350
351    case kRegexpStar:
352    case kRegexpPlus:
353    case kRegexpQuest:
354      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
355
356    case kRegexpRepeat:
357      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
358             a->min() == b->min() &&
359             a->max() == b->max();
360
361    case kRegexpCapture:
362      return a->cap() == b->cap() && a->name() == b->name();
363
364    case kRegexpHaveMatch:
365      return a->match_id() == b->match_id();
366
367    case kRegexpCharClass: {
368      CharClass* acc = a->cc();
369      CharClass* bcc = b->cc();
370      return acc->size() == bcc->size() &&
371             acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
372             memcmp(acc->begin(), bcc->begin(),
373                    (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
374    }
375  }
376
377  LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
378  return 0;
379}
380
381bool Regexp::Equal(Regexp* a, Regexp* b) {
382  if (a == NULL || b == NULL)
383    return a == b;
384
385  if (!TopEqual(a, b))
386    return false;
387
388  // Fast path:
389  // return without allocating vector if there are no subregexps.
390  switch (a->op()) {
391    case kRegexpAlternate:
392    case kRegexpConcat:
393    case kRegexpStar:
394    case kRegexpPlus:
395    case kRegexpQuest:
396    case kRegexpRepeat:
397    case kRegexpCapture:
398      break;
399
400    default:
401      return true;
402  }
403
404  // Committed to doing real work.
405  // The stack (vector) has pairs of regexps waiting to
406  // be compared.  The regexps are only equal if
407  // all the pairs end up being equal.
408  vector<Regexp*> stk;
409
410  for (;;) {
411    // Invariant: TopEqual(a, b) == true.
412    Regexp* a2;
413    Regexp* b2;
414    switch (a->op()) {
415      default:
416        break;
417      case kRegexpAlternate:
418      case kRegexpConcat:
419        for (int i = 0; i < a->nsub(); i++) {
420          a2 = a->sub()[i];
421          b2 = b->sub()[i];
422          if (!TopEqual(a2, b2))
423            return false;
424          stk.push_back(a2);
425          stk.push_back(b2);
426        }
427        break;
428
429      case kRegexpStar:
430      case kRegexpPlus:
431      case kRegexpQuest:
432      case kRegexpRepeat:
433      case kRegexpCapture:
434        a2 = a->sub()[0];
435        b2 = b->sub()[0];
436        if (!TopEqual(a2, b2))
437          return false;
438        // Really:
439        //   stk.push_back(a2);
440        //   stk.push_back(b2);
441        //   break;
442        // but faster to assign directly and loop.
443        a = a2;
444        b = b2;
445        continue;
446    }
447
448    int n = stk.size();
449    if (n == 0)
450      break;
451
452    a = stk[n-2];
453    b = stk[n-1];
454    stk.resize(n-2);
455  }
456
457  return true;
458}
459
460// Keep in sync with enum RegexpStatusCode in regexp.h
461static const char *kErrorStrings[] = {
462  "no error",
463  "unexpected error",
464  "invalid escape sequence",
465  "invalid character class",
466  "invalid character class range",
467  "missing ]",
468  "missing )",
469  "trailing \\",
470  "no argument for repetition operator",
471  "invalid repetition size",
472  "bad repetition operator",
473  "invalid perl operator",
474  "invalid UTF-8",
475  "invalid named capture group",
476};
477
478string RegexpStatus::CodeText(enum RegexpStatusCode code) {
479  if (code < 0 || code >= arraysize(kErrorStrings))
480    code = kRegexpInternalError;
481  return kErrorStrings[code];
482}
483
484string RegexpStatus::Text() const {
485  if (error_arg_.empty())
486    return CodeText(code_);
487  string s;
488  s.append(CodeText(code_));
489  s.append(": ");
490  s.append(error_arg_.data(), error_arg_.size());
491  return s;
492}
493
494void RegexpStatus::Copy(const RegexpStatus& status) {
495  code_ = status.code_;
496  error_arg_ = status.error_arg_;
497}
498
499typedef int Ignored;  // Walker<void> doesn't exist
500
501// Walker subclass to count capturing parens in regexp.
502class NumCapturesWalker : public Regexp::Walker<Ignored> {
503 public:
504  NumCapturesWalker() : ncapture_(0) {}
505  int ncapture() { return ncapture_; }
506
507  virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
508    if (re->op() == kRegexpCapture)
509      ncapture_++;
510    return ignored;
511  }
512  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
513    // Should never be called: we use Walk not WalkExponential.
514    LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
515    return ignored;
516  }
517
518 private:
519  int ncapture_;
520  DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
521};
522
523int Regexp::NumCaptures() {
524  NumCapturesWalker w;
525  w.Walk(this, 0);
526  return w.ncapture();
527}
528
529// Walker class to build map of named capture groups and their indices.
530class NamedCapturesWalker : public Regexp::Walker<Ignored> {
531 public:
532  NamedCapturesWalker() : map_(NULL) {}
533  ~NamedCapturesWalker() { delete map_; }
534
535  map<string, int>* TakeMap() {
536    map<string, int>* m = map_;
537    map_ = NULL;
538    return m;
539  }
540
541  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
542    if (re->op() == kRegexpCapture && re->name() != NULL) {
543      // Allocate map once we find a name.
544      if (map_ == NULL)
545        map_ = new map<string, int>;
546
547      // Record first occurrence of each name.
548      // (The rule is that if you have the same name
549      // multiple times, only the leftmost one counts.)
550      if (map_->find(*re->name()) == map_->end())
551        (*map_)[*re->name()] = re->cap();
552    }
553    return ignored;
554  }
555
556  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
557    // Should never be called: we use Walk not WalkExponential.
558    LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
559    return ignored;
560  }
561
562 private:
563  map<string, int>* map_;
564  DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
565};
566
567map<string, int>* Regexp::NamedCaptures() {
568  NamedCapturesWalker w;
569  w.Walk(this, 0);
570  return w.TakeMap();
571}
572
573// Walker class to build map from capture group indices to their names.
574class CaptureNamesWalker : public Regexp::Walker<Ignored> {
575 public:
576  CaptureNamesWalker() : map_(NULL) {}
577  ~CaptureNamesWalker() { delete map_; }
578
579  map<int, string>* TakeMap() {
580    map<int, string>* m = map_;
581    map_ = NULL;
582    return m;
583  }
584
585  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
586    if (re->op() == kRegexpCapture && re->name() != NULL) {
587      // Allocate map once we find a name.
588      if (map_ == NULL)
589        map_ = new map<int, string>;
590
591      (*map_)[re->cap()] = *re->name();
592    }
593    return ignored;
594  }
595
596  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
597    // Should never be called: we use Walk not WalkExponential.
598    LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
599    return ignored;
600  }
601
602 private:
603  map<int, string>* map_;
604  DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
605};
606
607map<int, string>* Regexp::CaptureNames() {
608  CaptureNamesWalker w;
609  w.Walk(this, 0);
610  return w.TakeMap();
611}
612
613// Determines whether regexp matches must be anchored
614// with a fixed string prefix.  If so, returns the prefix and
615// the regexp that remains after the prefix.  The prefix might
616// be ASCII case-insensitive.
617bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
618  // No need for a walker: the regexp must be of the form
619  // 1. some number of ^ anchors
620  // 2. a literal char or string
621  // 3. the rest
622  prefix->clear();
623  *foldcase = false;
624  *suffix = NULL;
625  if (op_ != kRegexpConcat)
626    return false;
627
628  // Some number of anchors, then a literal or concatenation.
629  int i = 0;
630  Regexp** sub = this->sub();
631  while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
632    i++;
633  if (i == 0 || i >= nsub_)
634    return false;
635
636  Regexp* re = sub[i];
637  switch (re->op_) {
638    default:
639      return false;
640
641    case kRegexpLiteralString:
642      // Convert to string in proper encoding.
643      if (re->parse_flags() & Latin1) {
644        prefix->resize(re->nrunes_);
645        for (int j = 0; j < re->nrunes_; j++)
646          (*prefix)[j] = re->runes_[j];
647      } else {
648        // Convert to UTF-8 in place.
649        // Assume worst-case space and then trim.
650        prefix->resize(re->nrunes_ * UTFmax);
651        char *p = &(*prefix)[0];
652        for (int j = 0; j < re->nrunes_; j++) {
653          Rune r = re->runes_[j];
654          if (r < Runeself)
655            *p++ = r;
656          else
657            p += runetochar(p, &r);
658        }
659        prefix->resize(p - &(*prefix)[0]);
660      }
661      break;
662
663    case kRegexpLiteral:
664      if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
665        prefix->append(1, re->rune_);
666      } else {
667        char buf[UTFmax];
668        prefix->append(buf, runetochar(buf, &re->rune_));
669      }
670      break;
671  }
672  *foldcase = (sub[i]->parse_flags() & FoldCase);
673  i++;
674
675  // The rest.
676  if (i < nsub_) {
677    for (int j = i; j < nsub_; j++)
678      sub[j]->Incref();
679    re = Concat(sub + i, nsub_ - i, parse_flags());
680  } else {
681    re = new Regexp(kRegexpEmptyMatch, parse_flags());
682  }
683  *suffix = re;
684  return true;
685}
686
687// Character class builder is a balanced binary tree (STL set)
688// containing non-overlapping, non-abutting RuneRanges.
689// The less-than operator used in the tree treats two
690// ranges as equal if they overlap at all, so that
691// lookups for a particular Rune are possible.
692
693CharClassBuilder::CharClassBuilder() {
694  nrunes_ = 0;
695  upper_ = 0;
696  lower_ = 0;
697}
698
699// Add lo-hi to the class; return whether class got bigger.
700bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
701  if (hi < lo)
702    return false;
703
704  if (lo <= 'z' && hi >= 'A') {
705    // Overlaps some alpha, maybe not all.
706    // Update bitmaps telling which ASCII letters are in the set.
707    Rune lo1 = max<Rune>(lo, 'A');
708    Rune hi1 = min<Rune>(hi, 'Z');
709    if (lo1 <= hi1)
710      upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
711
712    lo1 = max<Rune>(lo, 'a');
713    hi1 = min<Rune>(hi, 'z');
714    if (lo1 <= hi1)
715      lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
716  }
717
718  {  // Check whether lo, hi is already in the class.
719    iterator it = ranges_.find(RuneRange(lo, lo));
720    if (it != end() && it->lo <= lo && hi <= it->hi)
721      return false;
722  }
723
724  // Look for a range abutting lo on the left.
725  // If it exists, take it out and increase our range.
726  if (lo > 0) {
727    iterator it = ranges_.find(RuneRange(lo-1, lo-1));
728    if (it != end()) {
729      lo = it->lo;
730      if (it->hi > hi)
731        hi = it->hi;
732      nrunes_ -= it->hi - it->lo + 1;
733      ranges_.erase(it);
734    }
735  }
736
737  // Look for a range abutting hi on the right.
738  // If it exists, take it out and increase our range.
739  if (hi < Runemax) {
740    iterator it = ranges_.find(RuneRange(hi+1, hi+1));
741    if (it != end()) {
742      hi = it->hi;
743      nrunes_ -= it->hi - it->lo + 1;
744      ranges_.erase(it);
745    }
746  }
747
748  // Look for ranges between lo and hi.  Take them out.
749  // This is only safe because the set has no overlapping ranges.
750  // We've already removed any ranges abutting lo and hi, so
751  // any that overlap [lo, hi] must be contained within it.
752  for (;;) {
753    iterator it = ranges_.find(RuneRange(lo, hi));
754    if (it == end())
755      break;
756    nrunes_ -= it->hi - it->lo + 1;
757    ranges_.erase(it);
758  }
759
760  // Finally, add [lo, hi].
761  nrunes_ += hi - lo + 1;
762  ranges_.insert(RuneRange(lo, hi));
763  return true;
764}
765
766void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
767  for (iterator it = cc->begin(); it != cc->end(); ++it)
768    AddRange(it->lo, it->hi);
769}
770
771bool CharClassBuilder::Contains(Rune r) {
772  return ranges_.find(RuneRange(r, r)) != end();
773}
774
775// Does the character class behave the same on A-Z as on a-z?
776bool CharClassBuilder::FoldsASCII() {
777  return ((upper_ ^ lower_) & AlphaMask) == 0;
778}
779
780CharClassBuilder* CharClassBuilder::Copy() {
781  CharClassBuilder* cc = new CharClassBuilder;
782  for (iterator it = begin(); it != end(); ++it)
783    cc->ranges_.insert(RuneRange(it->lo, it->hi));
784  cc->upper_ = upper_;
785  cc->lower_ = lower_;
786  cc->nrunes_ = nrunes_;
787  return cc;
788}
789
790
791
792void CharClassBuilder::RemoveAbove(Rune r) {
793  if (r >= Runemax)
794    return;
795
796  if (r < 'z') {
797    if (r < 'a')
798      lower_ = 0;
799    else
800      lower_ &= AlphaMask >> ('z' - r);
801  }
802
803  if (r < 'Z') {
804    if (r < 'A')
805      upper_ = 0;
806    else
807      upper_ &= AlphaMask >> ('Z' - r);
808  }
809
810  for (;;) {
811
812    iterator it = ranges_.find(RuneRange(r + 1, Runemax));
813    if (it == end())
814      break;
815    RuneRange rr = *it;
816    ranges_.erase(it);
817    nrunes_ -= rr.hi - rr.lo + 1;
818    if (rr.lo <= r) {
819      rr.hi = r;
820      ranges_.insert(rr);
821      nrunes_ += rr.hi - rr.lo + 1;
822    }
823  }
824}
825
826void CharClassBuilder::Negate() {
827  // Build up negation and then copy in.
828  // Could edit ranges in place, but C++ won't let me.
829  vector<RuneRange> v;
830  v.reserve(ranges_.size() + 1);
831
832  // In negation, first range begins at 0, unless
833  // the current class begins at 0.
834  iterator it = begin();
835  if (it == end()) {
836    v.push_back(RuneRange(0, Runemax));
837  } else {
838    int nextlo = 0;
839    if (it->lo == 0) {
840      nextlo = it->hi + 1;
841      ++it;
842    }
843    for (; it != end(); ++it) {
844      v.push_back(RuneRange(nextlo, it->lo - 1));
845      nextlo = it->hi + 1;
846    }
847    if (nextlo <= Runemax)
848      v.push_back(RuneRange(nextlo, Runemax));
849  }
850
851  ranges_.clear();
852  for (int i = 0; i < v.size(); i++)
853    ranges_.insert(v[i]);
854
855  upper_ = AlphaMask & ~upper_;
856  lower_ = AlphaMask & ~lower_;
857  nrunes_ = Runemax+1 - nrunes_;
858}
859
860// Character class is a sorted list of ranges.
861// The ranges are allocated in the same block as the header,
862// necessitating a special allocator and Delete method.
863
864CharClass* CharClass::New(int maxranges) {
865  CharClass* cc;
866  uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
867  cc = reinterpret_cast<CharClass*>(data);
868  cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
869  cc->nranges_ = 0;
870  cc->folds_ascii_ = false;
871  cc->nrunes_ = 0;
872  return cc;
873}
874
875void CharClass::Delete() {
876  uint8 *data = reinterpret_cast<uint8*>(this);
877  delete[] data;
878}
879
880CharClass* CharClass::Negate() {
881  CharClass* cc = CharClass::New(nranges_+1);
882  cc->folds_ascii_ = folds_ascii_;
883  cc->nrunes_ = Runemax + 1 - nrunes_;
884  int n = 0;
885  int nextlo = 0;
886  for (CharClass::iterator it = begin(); it != end(); ++it) {
887    if (it->lo == nextlo) {
888      nextlo = it->hi + 1;
889    } else {
890      cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
891      nextlo = it->hi + 1;
892    }
893  }
894  if (nextlo <= Runemax)
895    cc->ranges_[n++] = RuneRange(nextlo, Runemax);
896  cc->nranges_ = n;
897  return cc;
898}
899
900bool CharClass::Contains(Rune r) {
901  RuneRange* rr = ranges_;
902  int n = nranges_;
903  while (n > 0) {
904    int m = n/2;
905    if (rr[m].hi < r) {
906      rr += m+1;
907      n -= m+1;
908    } else if (r < rr[m].lo) {
909      n = m;
910    } else {  // rr[m].lo <= r && r <= rr[m].hi
911      return true;
912    }
913  }
914  return false;
915}
916
917CharClass* CharClassBuilder::GetCharClass() {
918  CharClass* cc = CharClass::New(ranges_.size());
919  int n = 0;
920  for (iterator it = begin(); it != end(); ++it)
921    cc->ranges_[n++] = *it;
922  cc->nranges_ = n;
923  DCHECK_LE(n, ranges_.size());
924  cc->nrunes_ = nrunes_;
925  cc->folds_ascii_ = FoldsASCII();
926  return cc;
927}
928
929}  // namespace re2
930