1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression representation.
6 // Tested by parse_test.cc
7 
8 #include "util/util.h"
9 #include "re2/regexp.h"
10 #include "re2/stringpiece.h"
11 #include "re2/walker-inl.h"
12 
13 namespace re2 {
14 
15 // Constructor.  Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
17   : op_(op),
18     simple_(false),
19     parse_flags_(static_cast<uint16>(parse_flags)),
20     ref_(1),
21     nsub_(0),
22     down_(NULL) {
23   subone_ = NULL;
24   memset(the_union_, 0, sizeof the_union_);
25 }
26 
27 // Destructor.  Assumes already cleaned up children.
28 // Private: use Decref() instead of delete to destroy Regexps.
29 // Can't call Decref on the sub-Regexps here because
30 // that could cause arbitrarily deep recursion, so
31 // required Decref() to have handled them for us.
~Regexp()32 Regexp::~Regexp() {
33   if (nsub_ > 0)
34     LOG(DFATAL) << "Regexp not destroyed.";
35 
36   switch (op_) {
37     default:
38       break;
39     case kRegexpCapture:
40       delete name_;
41       break;
42     case kRegexpLiteralString:
43       delete[] runes_;
44       break;
45     case kRegexpCharClass:
46       cc_->Delete();
47       delete ccb_;
48       break;
49   }
50 }
51 
52 // If it's possible to destroy this regexp without recurring,
53 // do so and return true.  Else return false.
QuickDestroy()54 bool Regexp::QuickDestroy() {
55   if (nsub_ == 0) {
56     delete this;
57     return true;
58   }
59   return false;
60 }
61 
62 static map<Regexp*, int> *ref_map;
63 GLOBAL_MUTEX(ref_mutex);
64 
Ref()65 int Regexp::Ref() {
66   if (ref_ < kMaxRef)
67     return ref_;
68 
69   GLOBAL_MUTEX_LOCK(ref_mutex);
70   int r = 0;
71   if (ref_map != NULL) {
72     r = (*ref_map)[this];
73   }
74   GLOBAL_MUTEX_UNLOCK(ref_mutex);
75   return r;
76 }
77 
78 // Increments reference count, returns object as convenience.
Incref()79 Regexp* Regexp::Incref() {
80   if (ref_ >= kMaxRef-1) {
81     // Store ref count in overflow map.
82     GLOBAL_MUTEX_LOCK(ref_mutex);
83     if (ref_map == NULL) {
84       ref_map = new map<Regexp*, int>;
85     }
86     if (ref_ == kMaxRef) {
87       // already overflowed
88       (*ref_map)[this]++;
89     } else {
90       // overflowing now
91       (*ref_map)[this] = kMaxRef;
92       ref_ = kMaxRef;
93     }
94     GLOBAL_MUTEX_UNLOCK(ref_mutex);
95     return this;
96   }
97 
98   ref_++;
99   return this;
100 }
101 
102 // Decrements reference count and deletes this object if count reaches 0.
Decref()103 void Regexp::Decref() {
104   if (ref_ == kMaxRef) {
105     // Ref count is stored in overflow map.
106     GLOBAL_MUTEX_LOCK(ref_mutex);
107     int r = (*ref_map)[this] - 1;
108     if (r < kMaxRef) {
109       ref_ = r;
110       ref_map->erase(this);
111     } else {
112       (*ref_map)[this] = r;
113     }
114     GLOBAL_MUTEX_UNLOCK(ref_mutex);
115     return;
116   }
117   ref_--;
118   if (ref_ == 0)
119     Destroy();
120 }
121 
122 // Deletes this object; ref count has count reached 0.
Destroy()123 void Regexp::Destroy() {
124   if (QuickDestroy())
125     return;
126 
127   // Handle recursive Destroy with explicit stack
128   // to avoid arbitrarily deep recursion on process stack [sigh].
129   down_ = NULL;
130   Regexp* stack = this;
131   while (stack != NULL) {
132     Regexp* re = stack;
133     stack = re->down_;
134     if (re->ref_ != 0)
135       LOG(DFATAL) << "Bad reference count " << re->ref_;
136     if (re->nsub_ > 0) {
137       Regexp** subs = re->sub();
138       for (int i = 0; i < re->nsub_; i++) {
139         Regexp* sub = subs[i];
140         if (sub == NULL)
141           continue;
142         if (sub->ref_ == kMaxRef)
143           sub->Decref();
144         else
145           --sub->ref_;
146         if (sub->ref_ == 0 && !sub->QuickDestroy()) {
147           sub->down_ = stack;
148           stack = sub;
149         }
150       }
151       if (re->nsub_ > 1)
152         delete[] subs;
153       re->nsub_ = 0;
154     }
155     delete re;
156   }
157 }
158 
AddRuneToString(Rune r)159 void Regexp::AddRuneToString(Rune r) {
160   DCHECK(op_ == kRegexpLiteralString);
161   if (nrunes_ == 0) {
162     // start with 8
163     runes_ = new Rune[8];
164   } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
165     // double on powers of two
166     Rune *old = runes_;
167     runes_ = new Rune[nrunes_ * 2];
168     for (int i = 0; i < nrunes_; i++)
169       runes_[i] = old[i];
170     delete[] old;
171   }
172 
173   runes_[nrunes_++] = r;
174 }
175 
HaveMatch(int match_id,ParseFlags flags)176 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
177   Regexp* re = new Regexp(kRegexpHaveMatch, flags);
178   re->match_id_ = match_id;
179   return re;
180 }
181 
Plus(Regexp * sub,ParseFlags flags)182 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
183   if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
184     return sub;
185   Regexp* re = new Regexp(kRegexpPlus, flags);
186   re->AllocSub(1);
187   re->sub()[0] = sub;
188   return re;
189 }
190 
Star(Regexp * sub,ParseFlags flags)191 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
192   if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
193     return sub;
194   Regexp* re = new Regexp(kRegexpStar, flags);
195   re->AllocSub(1);
196   re->sub()[0] = sub;
197   return re;
198 }
199 
Quest(Regexp * sub,ParseFlags flags)200 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
201   if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
202     return sub;
203   Regexp* re = new Regexp(kRegexpQuest, flags);
204   re->AllocSub(1);
205   re->sub()[0] = sub;
206   return re;
207 }
208 
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)209 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
210                                   ParseFlags flags, bool can_factor) {
211   if (nsub == 1)
212     return sub[0];
213 
214   Regexp** subcopy = NULL;
215   if (op == kRegexpAlternate && can_factor) {
216     // Going to edit sub; make a copy so we don't step on caller.
217     subcopy = new Regexp*[nsub];
218     memmove(subcopy, sub, nsub * sizeof sub[0]);
219     sub = subcopy;
220     nsub = FactorAlternation(sub, nsub, flags);
221     if (nsub == 1) {
222       Regexp* re = sub[0];
223       delete[] subcopy;
224       return re;
225     }
226   }
227 
228   if (nsub > kMaxNsub) {
229     // Too many subexpressions to fit in a single Regexp.
230     // Make a two-level tree.  Two levels gets us to 65535^2.
231     int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
232     Regexp* re = new Regexp(op, flags);
233     re->AllocSub(nbigsub);
234     Regexp** subs = re->sub();
235     for (int i = 0; i < nbigsub - 1; i++)
236       subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
237     subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
238                                           nsub - (nbigsub-1)*kMaxNsub, flags,
239                                           false);
240     delete[] subcopy;
241     return re;
242   }
243 
244   Regexp* re = new Regexp(op, flags);
245   re->AllocSub(nsub);
246   Regexp** subs = re->sub();
247   for (int i = 0; i < nsub; i++)
248     subs[i] = sub[i];
249 
250   delete[] subcopy;
251   return re;
252 }
253 
Concat(Regexp ** sub,int nsub,ParseFlags flags)254 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
255   return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
256 }
257 
Alternate(Regexp ** sub,int nsub,ParseFlags flags)258 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
259   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
260 }
261 
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)262 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
263   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
264 }
265 
Capture(Regexp * sub,ParseFlags flags,int cap)266 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
267   Regexp* re = new Regexp(kRegexpCapture, flags);
268   re->AllocSub(1);
269   re->sub()[0] = sub;
270   re->cap_ = cap;
271   return re;
272 }
273 
Repeat(Regexp * sub,ParseFlags flags,int min,int max)274 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
275   Regexp* re = new Regexp(kRegexpRepeat, flags);
276   re->AllocSub(1);
277   re->sub()[0] = sub;
278   re->min_ = min;
279   re->max_ = max;
280   return re;
281 }
282 
NewLiteral(Rune rune,ParseFlags flags)283 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
284   Regexp* re = new Regexp(kRegexpLiteral, flags);
285   re->rune_ = rune;
286   return re;
287 }
288 
LiteralString(Rune * runes,int nrunes,ParseFlags flags)289 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
290   if (nrunes <= 0)
291     return new Regexp(kRegexpEmptyMatch, flags);
292   if (nrunes == 1)
293     return NewLiteral(runes[0], flags);
294   Regexp* re = new Regexp(kRegexpLiteralString, flags);
295   for (int i = 0; i < nrunes; i++)
296     re->AddRuneToString(runes[i]);
297   return re;
298 }
299 
NewCharClass(CharClass * cc,ParseFlags flags)300 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
301   Regexp* re = new Regexp(kRegexpCharClass, flags);
302   re->cc_ = cc;
303   return re;
304 }
305 
306 // Swaps this and that in place.
Swap(Regexp * that)307 void Regexp::Swap(Regexp* that) {
308   // Can use memmove because Regexp is just a struct (no vtable).
309   char tmp[sizeof *this];
310   memmove(tmp, this, sizeof tmp);
311   memmove(this, that, sizeof tmp);
312   memmove(that, tmp, sizeof tmp);
313 }
314 
315 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)316 static bool TopEqual(Regexp* a, Regexp* b) {
317   if (a->op() != b->op())
318     return false;
319 
320   switch (a->op()) {
321     case kRegexpNoMatch:
322     case kRegexpEmptyMatch:
323     case kRegexpAnyChar:
324     case kRegexpAnyByte:
325     case kRegexpBeginLine:
326     case kRegexpEndLine:
327     case kRegexpWordBoundary:
328     case kRegexpNoWordBoundary:
329     case kRegexpBeginText:
330       return true;
331 
332     case kRegexpEndText:
333       // The parse flags remember whether it's \z or (?-m:$),
334       // which matters when testing against PCRE.
335       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
336 
337     case kRegexpLiteral:
338       return a->rune() == b->rune() &&
339              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
340 
341     case kRegexpLiteralString:
342       return a->nrunes() == b->nrunes() &&
343              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
344              memcmp(a->runes(), b->runes(),
345                     a->nrunes() * sizeof a->runes()[0]) == 0;
346 
347     case kRegexpAlternate:
348     case kRegexpConcat:
349       return a->nsub() == b->nsub();
350 
351     case kRegexpStar:
352     case kRegexpPlus:
353     case kRegexpQuest:
354       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
355 
356     case kRegexpRepeat:
357       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
358              a->min() == b->min() &&
359              a->max() == b->max();
360 
361     case kRegexpCapture:
362       return a->cap() == b->cap() && a->name() == b->name();
363 
364     case kRegexpHaveMatch:
365       return a->match_id() == b->match_id();
366 
367     case kRegexpCharClass: {
368       CharClass* acc = a->cc();
369       CharClass* bcc = b->cc();
370       return acc->size() == bcc->size() &&
371              acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
372              memcmp(acc->begin(), bcc->begin(),
373                     (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
374     }
375   }
376 
377   LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
378   return 0;
379 }
380 
Equal(Regexp * a,Regexp * b)381 bool Regexp::Equal(Regexp* a, Regexp* b) {
382   if (a == NULL || b == NULL)
383     return a == b;
384 
385   if (!TopEqual(a, b))
386     return false;
387 
388   // Fast path:
389   // return without allocating vector if there are no subregexps.
390   switch (a->op()) {
391     case kRegexpAlternate:
392     case kRegexpConcat:
393     case kRegexpStar:
394     case kRegexpPlus:
395     case kRegexpQuest:
396     case kRegexpRepeat:
397     case kRegexpCapture:
398       break;
399 
400     default:
401       return true;
402   }
403 
404   // Committed to doing real work.
405   // The stack (vector) has pairs of regexps waiting to
406   // be compared.  The regexps are only equal if
407   // all the pairs end up being equal.
408   vector<Regexp*> stk;
409 
410   for (;;) {
411     // Invariant: TopEqual(a, b) == true.
412     Regexp* a2;
413     Regexp* b2;
414     switch (a->op()) {
415       default:
416         break;
417       case kRegexpAlternate:
418       case kRegexpConcat:
419         for (int i = 0; i < a->nsub(); i++) {
420           a2 = a->sub()[i];
421           b2 = b->sub()[i];
422           if (!TopEqual(a2, b2))
423             return false;
424           stk.push_back(a2);
425           stk.push_back(b2);
426         }
427         break;
428 
429       case kRegexpStar:
430       case kRegexpPlus:
431       case kRegexpQuest:
432       case kRegexpRepeat:
433       case kRegexpCapture:
434         a2 = a->sub()[0];
435         b2 = b->sub()[0];
436         if (!TopEqual(a2, b2))
437           return false;
438         // Really:
439         //   stk.push_back(a2);
440         //   stk.push_back(b2);
441         //   break;
442         // but faster to assign directly and loop.
443         a = a2;
444         b = b2;
445         continue;
446     }
447 
448     int n = stk.size();
449     if (n == 0)
450       break;
451 
452     a = stk[n-2];
453     b = stk[n-1];
454     stk.resize(n-2);
455   }
456 
457   return true;
458 }
459 
460 // Keep in sync with enum RegexpStatusCode in regexp.h
461 static const char *kErrorStrings[] = {
462   "no error",
463   "unexpected error",
464   "invalid escape sequence",
465   "invalid character class",
466   "invalid character class range",
467   "missing ]",
468   "missing )",
469   "trailing \\",
470   "no argument for repetition operator",
471   "invalid repetition size",
472   "bad repetition operator",
473   "invalid perl operator",
474   "invalid UTF-8",
475   "invalid named capture group",
476 };
477 
CodeText(enum RegexpStatusCode code)478 string RegexpStatus::CodeText(enum RegexpStatusCode code) {
479   if (code < 0 || code >= arraysize(kErrorStrings))
480     code = kRegexpInternalError;
481   return kErrorStrings[code];
482 }
483 
Text() const484 string RegexpStatus::Text() const {
485   if (error_arg_.empty())
486     return CodeText(code_);
487   string s;
488   s.append(CodeText(code_));
489   s.append(": ");
490   s.append(error_arg_.data(), error_arg_.size());
491   return s;
492 }
493 
Copy(const RegexpStatus & status)494 void RegexpStatus::Copy(const RegexpStatus& status) {
495   code_ = status.code_;
496   error_arg_ = status.error_arg_;
497 }
498 
499 typedef int Ignored;  // Walker<void> doesn't exist
500 
501 // Walker subclass to count capturing parens in regexp.
502 class NumCapturesWalker : public Regexp::Walker<Ignored> {
503  public:
NumCapturesWalker()504   NumCapturesWalker() : ncapture_(0) {}
ncapture()505   int ncapture() { return ncapture_; }
506 
PreVisit(Regexp * re,Ignored ignored,bool * stop)507   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
508     if (re->op() == kRegexpCapture)
509       ncapture_++;
510     return ignored;
511   }
ShortVisit(Regexp * re,Ignored ignored)512   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
513     // Should never be called: we use Walk not WalkExponential.
514     LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
515     return ignored;
516   }
517 
518  private:
519   int ncapture_;
520   DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
521 };
522 
NumCaptures()523 int Regexp::NumCaptures() {
524   NumCapturesWalker w;
525   w.Walk(this, 0);
526   return w.ncapture();
527 }
528 
529 // Walker class to build map of named capture groups and their indices.
530 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
531  public:
NamedCapturesWalker()532   NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()533   ~NamedCapturesWalker() { delete map_; }
534 
TakeMap()535   map<string, int>* TakeMap() {
536     map<string, int>* m = map_;
537     map_ = NULL;
538     return m;
539   }
540 
PreVisit(Regexp * re,Ignored ignored,bool * stop)541   Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
542     if (re->op() == kRegexpCapture && re->name() != NULL) {
543       // Allocate map once we find a name.
544       if (map_ == NULL)
545         map_ = new map<string, int>;
546 
547       // Record first occurrence of each name.
548       // (The rule is that if you have the same name
549       // multiple times, only the leftmost one counts.)
550       if (map_->find(*re->name()) == map_->end())
551         (*map_)[*re->name()] = re->cap();
552     }
553     return ignored;
554   }
555 
ShortVisit(Regexp * re,Ignored ignored)556   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
557     // Should never be called: we use Walk not WalkExponential.
558     LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
559     return ignored;
560   }
561 
562  private:
563   map<string, int>* map_;
564   DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
565 };
566 
NamedCaptures()567 map<string, int>* Regexp::NamedCaptures() {
568   NamedCapturesWalker w;
569   w.Walk(this, 0);
570   return w.TakeMap();
571 }
572 
573 // Walker class to build map from capture group indices to their names.
574 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
575  public:
CaptureNamesWalker()576   CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()577   ~CaptureNamesWalker() { delete map_; }
578 
TakeMap()579   map<int, string>* TakeMap() {
580     map<int, string>* m = map_;
581     map_ = NULL;
582     return m;
583   }
584 
PreVisit(Regexp * re,Ignored ignored,bool * stop)585   Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
586     if (re->op() == kRegexpCapture && re->name() != NULL) {
587       // Allocate map once we find a name.
588       if (map_ == NULL)
589         map_ = new map<int, string>;
590 
591       (*map_)[re->cap()] = *re->name();
592     }
593     return ignored;
594   }
595 
ShortVisit(Regexp * re,Ignored ignored)596   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
597     // Should never be called: we use Walk not WalkExponential.
598     LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
599     return ignored;
600   }
601 
602  private:
603   map<int, string>* map_;
604   DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
605 };
606 
CaptureNames()607 map<int, string>* Regexp::CaptureNames() {
608   CaptureNamesWalker w;
609   w.Walk(this, 0);
610   return w.TakeMap();
611 }
612 
613 // Determines whether regexp matches must be anchored
614 // with a fixed string prefix.  If so, returns the prefix and
615 // the regexp that remains after the prefix.  The prefix might
616 // be ASCII case-insensitive.
RequiredPrefix(string * prefix,bool * foldcase,Regexp ** suffix)617 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
618   // No need for a walker: the regexp must be of the form
619   // 1. some number of ^ anchors
620   // 2. a literal char or string
621   // 3. the rest
622   prefix->clear();
623   *foldcase = false;
624   *suffix = NULL;
625   if (op_ != kRegexpConcat)
626     return false;
627 
628   // Some number of anchors, then a literal or concatenation.
629   int i = 0;
630   Regexp** sub = this->sub();
631   while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
632     i++;
633   if (i == 0 || i >= nsub_)
634     return false;
635 
636   Regexp* re = sub[i];
637   switch (re->op_) {
638     default:
639       return false;
640 
641     case kRegexpLiteralString:
642       // Convert to string in proper encoding.
643       if (re->parse_flags() & Latin1) {
644         prefix->resize(re->nrunes_);
645         for (int j = 0; j < re->nrunes_; j++)
646           (*prefix)[j] = re->runes_[j];
647       } else {
648         // Convert to UTF-8 in place.
649         // Assume worst-case space and then trim.
650         prefix->resize(re->nrunes_ * UTFmax);
651         char *p = &(*prefix)[0];
652         for (int j = 0; j < re->nrunes_; j++) {
653           Rune r = re->runes_[j];
654           if (r < Runeself)
655             *p++ = r;
656           else
657             p += runetochar(p, &r);
658         }
659         prefix->resize(p - &(*prefix)[0]);
660       }
661       break;
662 
663     case kRegexpLiteral:
664       if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
665         prefix->append(1, re->rune_);
666       } else {
667         char buf[UTFmax];
668         prefix->append(buf, runetochar(buf, &re->rune_));
669       }
670       break;
671   }
672   *foldcase = (sub[i]->parse_flags() & FoldCase);
673   i++;
674 
675   // The rest.
676   if (i < nsub_) {
677     for (int j = i; j < nsub_; j++)
678       sub[j]->Incref();
679     re = Concat(sub + i, nsub_ - i, parse_flags());
680   } else {
681     re = new Regexp(kRegexpEmptyMatch, parse_flags());
682   }
683   *suffix = re;
684   return true;
685 }
686 
687 // Character class builder is a balanced binary tree (STL set)
688 // containing non-overlapping, non-abutting RuneRanges.
689 // The less-than operator used in the tree treats two
690 // ranges as equal if they overlap at all, so that
691 // lookups for a particular Rune are possible.
692 
CharClassBuilder()693 CharClassBuilder::CharClassBuilder() {
694   nrunes_ = 0;
695   upper_ = 0;
696   lower_ = 0;
697 }
698 
699 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)700 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
701   if (hi < lo)
702     return false;
703 
704   if (lo <= 'z' && hi >= 'A') {
705     // Overlaps some alpha, maybe not all.
706     // Update bitmaps telling which ASCII letters are in the set.
707     Rune lo1 = max<Rune>(lo, 'A');
708     Rune hi1 = min<Rune>(hi, 'Z');
709     if (lo1 <= hi1)
710       upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
711 
712     lo1 = max<Rune>(lo, 'a');
713     hi1 = min<Rune>(hi, 'z');
714     if (lo1 <= hi1)
715       lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
716   }
717 
718   {  // Check whether lo, hi is already in the class.
719     iterator it = ranges_.find(RuneRange(lo, lo));
720     if (it != end() && it->lo <= lo && hi <= it->hi)
721       return false;
722   }
723 
724   // Look for a range abutting lo on the left.
725   // If it exists, take it out and increase our range.
726   if (lo > 0) {
727     iterator it = ranges_.find(RuneRange(lo-1, lo-1));
728     if (it != end()) {
729       lo = it->lo;
730       if (it->hi > hi)
731         hi = it->hi;
732       nrunes_ -= it->hi - it->lo + 1;
733       ranges_.erase(it);
734     }
735   }
736 
737   // Look for a range abutting hi on the right.
738   // If it exists, take it out and increase our range.
739   if (hi < Runemax) {
740     iterator it = ranges_.find(RuneRange(hi+1, hi+1));
741     if (it != end()) {
742       hi = it->hi;
743       nrunes_ -= it->hi - it->lo + 1;
744       ranges_.erase(it);
745     }
746   }
747 
748   // Look for ranges between lo and hi.  Take them out.
749   // This is only safe because the set has no overlapping ranges.
750   // We've already removed any ranges abutting lo and hi, so
751   // any that overlap [lo, hi] must be contained within it.
752   for (;;) {
753     iterator it = ranges_.find(RuneRange(lo, hi));
754     if (it == end())
755       break;
756     nrunes_ -= it->hi - it->lo + 1;
757     ranges_.erase(it);
758   }
759 
760   // Finally, add [lo, hi].
761   nrunes_ += hi - lo + 1;
762   ranges_.insert(RuneRange(lo, hi));
763   return true;
764 }
765 
AddCharClass(CharClassBuilder * cc)766 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
767   for (iterator it = cc->begin(); it != cc->end(); ++it)
768     AddRange(it->lo, it->hi);
769 }
770 
Contains(Rune r)771 bool CharClassBuilder::Contains(Rune r) {
772   return ranges_.find(RuneRange(r, r)) != end();
773 }
774 
775 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()776 bool CharClassBuilder::FoldsASCII() {
777   return ((upper_ ^ lower_) & AlphaMask) == 0;
778 }
779 
Copy()780 CharClassBuilder* CharClassBuilder::Copy() {
781   CharClassBuilder* cc = new CharClassBuilder;
782   for (iterator it = begin(); it != end(); ++it)
783     cc->ranges_.insert(RuneRange(it->lo, it->hi));
784   cc->upper_ = upper_;
785   cc->lower_ = lower_;
786   cc->nrunes_ = nrunes_;
787   return cc;
788 }
789 
790 
791 
RemoveAbove(Rune r)792 void CharClassBuilder::RemoveAbove(Rune r) {
793   if (r >= Runemax)
794     return;
795 
796   if (r < 'z') {
797     if (r < 'a')
798       lower_ = 0;
799     else
800       lower_ &= AlphaMask >> ('z' - r);
801   }
802 
803   if (r < 'Z') {
804     if (r < 'A')
805       upper_ = 0;
806     else
807       upper_ &= AlphaMask >> ('Z' - r);
808   }
809 
810   for (;;) {
811 
812     iterator it = ranges_.find(RuneRange(r + 1, Runemax));
813     if (it == end())
814       break;
815     RuneRange rr = *it;
816     ranges_.erase(it);
817     nrunes_ -= rr.hi - rr.lo + 1;
818     if (rr.lo <= r) {
819       rr.hi = r;
820       ranges_.insert(rr);
821       nrunes_ += rr.hi - rr.lo + 1;
822     }
823   }
824 }
825 
Negate()826 void CharClassBuilder::Negate() {
827   // Build up negation and then copy in.
828   // Could edit ranges in place, but C++ won't let me.
829   vector<RuneRange> v;
830   v.reserve(ranges_.size() + 1);
831 
832   // In negation, first range begins at 0, unless
833   // the current class begins at 0.
834   iterator it = begin();
835   if (it == end()) {
836     v.push_back(RuneRange(0, Runemax));
837   } else {
838     int nextlo = 0;
839     if (it->lo == 0) {
840       nextlo = it->hi + 1;
841       ++it;
842     }
843     for (; it != end(); ++it) {
844       v.push_back(RuneRange(nextlo, it->lo - 1));
845       nextlo = it->hi + 1;
846     }
847     if (nextlo <= Runemax)
848       v.push_back(RuneRange(nextlo, Runemax));
849   }
850 
851   ranges_.clear();
852   for (int i = 0; i < v.size(); i++)
853     ranges_.insert(v[i]);
854 
855   upper_ = AlphaMask & ~upper_;
856   lower_ = AlphaMask & ~lower_;
857   nrunes_ = Runemax+1 - nrunes_;
858 }
859 
860 // Character class is a sorted list of ranges.
861 // The ranges are allocated in the same block as the header,
862 // necessitating a special allocator and Delete method.
863 
New(int maxranges)864 CharClass* CharClass::New(int maxranges) {
865   CharClass* cc;
866   uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
867   cc = reinterpret_cast<CharClass*>(data);
868   cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
869   cc->nranges_ = 0;
870   cc->folds_ascii_ = false;
871   cc->nrunes_ = 0;
872   return cc;
873 }
874 
Delete()875 void CharClass::Delete() {
876   if (this == NULL)
877     return;
878   uint8 *data = reinterpret_cast<uint8*>(this);
879   delete[] data;
880 }
881 
Negate()882 CharClass* CharClass::Negate() {
883   CharClass* cc = CharClass::New(nranges_+1);
884   cc->folds_ascii_ = folds_ascii_;
885   cc->nrunes_ = Runemax + 1 - nrunes_;
886   int n = 0;
887   int nextlo = 0;
888   for (CharClass::iterator it = begin(); it != end(); ++it) {
889     if (it->lo == nextlo) {
890       nextlo = it->hi + 1;
891     } else {
892       cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
893       nextlo = it->hi + 1;
894     }
895   }
896   if (nextlo <= Runemax)
897     cc->ranges_[n++] = RuneRange(nextlo, Runemax);
898   cc->nranges_ = n;
899   return cc;
900 }
901 
Contains(Rune r)902 bool CharClass::Contains(Rune r) {
903   RuneRange* rr = ranges_;
904   int n = nranges_;
905   while (n > 0) {
906     int m = n/2;
907     if (rr[m].hi < r) {
908       rr += m+1;
909       n -= m+1;
910     } else if (r < rr[m].lo) {
911       n = m;
912     } else {  // rr[m].lo <= r && r <= rr[m].hi
913       return true;
914     }
915   }
916   return false;
917 }
918 
GetCharClass()919 CharClass* CharClassBuilder::GetCharClass() {
920   CharClass* cc = CharClass::New(ranges_.size());
921   int n = 0;
922   for (iterator it = begin(); it != end(); ++it)
923     cc->ranges_[n++] = *it;
924   cc->nranges_ = n;
925   DCHECK_LE(n, ranges_.size());
926   cc->nrunes_ = nrunes_;
927   cc->folds_ascii_ = FoldsASCII();
928   return cc;
929 }
930 
931 }  // namespace re2
932