1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression representation.
6 // Tested by parse_test.cc
7
8 #include "util/util.h"
9 #include "re2/regexp.h"
10 #include "re2/stringpiece.h"
11 #include "re2/walker-inl.h"
12
13 namespace re2 {
14
15 // Constructor. Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
17 : op_(op),
18 simple_(false),
19 parse_flags_(static_cast<uint16>(parse_flags)),
20 ref_(1),
21 nsub_(0),
22 down_(NULL) {
23 subone_ = NULL;
24 memset(the_union_, 0, sizeof the_union_);
25 }
26
27 // Destructor. Assumes already cleaned up children.
28 // Private: use Decref() instead of delete to destroy Regexps.
29 // Can't call Decref on the sub-Regexps here because
30 // that could cause arbitrarily deep recursion, so
31 // required Decref() to have handled them for us.
~Regexp()32 Regexp::~Regexp() {
33 if (nsub_ > 0)
34 LOG(DFATAL) << "Regexp not destroyed.";
35
36 switch (op_) {
37 default:
38 break;
39 case kRegexpCapture:
40 delete name_;
41 break;
42 case kRegexpLiteralString:
43 delete[] runes_;
44 break;
45 case kRegexpCharClass:
46 cc_->Delete();
47 delete ccb_;
48 break;
49 }
50 }
51
52 // If it's possible to destroy this regexp without recurring,
53 // do so and return true. Else return false.
QuickDestroy()54 bool Regexp::QuickDestroy() {
55 if (nsub_ == 0) {
56 delete this;
57 return true;
58 }
59 return false;
60 }
61
62 static map<Regexp*, int> *ref_map;
63 GLOBAL_MUTEX(ref_mutex);
64
Ref()65 int Regexp::Ref() {
66 if (ref_ < kMaxRef)
67 return ref_;
68
69 GLOBAL_MUTEX_LOCK(ref_mutex);
70 int r = 0;
71 if (ref_map != NULL) {
72 r = (*ref_map)[this];
73 }
74 GLOBAL_MUTEX_UNLOCK(ref_mutex);
75 return r;
76 }
77
78 // Increments reference count, returns object as convenience.
Incref()79 Regexp* Regexp::Incref() {
80 if (ref_ >= kMaxRef-1) {
81 // Store ref count in overflow map.
82 GLOBAL_MUTEX_LOCK(ref_mutex);
83 if (ref_map == NULL) {
84 ref_map = new map<Regexp*, int>;
85 }
86 if (ref_ == kMaxRef) {
87 // already overflowed
88 (*ref_map)[this]++;
89 } else {
90 // overflowing now
91 (*ref_map)[this] = kMaxRef;
92 ref_ = kMaxRef;
93 }
94 GLOBAL_MUTEX_UNLOCK(ref_mutex);
95 return this;
96 }
97
98 ref_++;
99 return this;
100 }
101
102 // Decrements reference count and deletes this object if count reaches 0.
Decref()103 void Regexp::Decref() {
104 if (ref_ == kMaxRef) {
105 // Ref count is stored in overflow map.
106 GLOBAL_MUTEX_LOCK(ref_mutex);
107 int r = (*ref_map)[this] - 1;
108 if (r < kMaxRef) {
109 ref_ = r;
110 ref_map->erase(this);
111 } else {
112 (*ref_map)[this] = r;
113 }
114 GLOBAL_MUTEX_UNLOCK(ref_mutex);
115 return;
116 }
117 ref_--;
118 if (ref_ == 0)
119 Destroy();
120 }
121
122 // Deletes this object; ref count has count reached 0.
Destroy()123 void Regexp::Destroy() {
124 if (QuickDestroy())
125 return;
126
127 // Handle recursive Destroy with explicit stack
128 // to avoid arbitrarily deep recursion on process stack [sigh].
129 down_ = NULL;
130 Regexp* stack = this;
131 while (stack != NULL) {
132 Regexp* re = stack;
133 stack = re->down_;
134 if (re->ref_ != 0)
135 LOG(DFATAL) << "Bad reference count " << re->ref_;
136 if (re->nsub_ > 0) {
137 Regexp** subs = re->sub();
138 for (int i = 0; i < re->nsub_; i++) {
139 Regexp* sub = subs[i];
140 if (sub == NULL)
141 continue;
142 if (sub->ref_ == kMaxRef)
143 sub->Decref();
144 else
145 --sub->ref_;
146 if (sub->ref_ == 0 && !sub->QuickDestroy()) {
147 sub->down_ = stack;
148 stack = sub;
149 }
150 }
151 if (re->nsub_ > 1)
152 delete[] subs;
153 re->nsub_ = 0;
154 }
155 delete re;
156 }
157 }
158
AddRuneToString(Rune r)159 void Regexp::AddRuneToString(Rune r) {
160 DCHECK(op_ == kRegexpLiteralString);
161 if (nrunes_ == 0) {
162 // start with 8
163 runes_ = new Rune[8];
164 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
165 // double on powers of two
166 Rune *old = runes_;
167 runes_ = new Rune[nrunes_ * 2];
168 for (int i = 0; i < nrunes_; i++)
169 runes_[i] = old[i];
170 delete[] old;
171 }
172
173 runes_[nrunes_++] = r;
174 }
175
HaveMatch(int match_id,ParseFlags flags)176 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
177 Regexp* re = new Regexp(kRegexpHaveMatch, flags);
178 re->match_id_ = match_id;
179 return re;
180 }
181
Plus(Regexp * sub,ParseFlags flags)182 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
183 if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
184 return sub;
185 Regexp* re = new Regexp(kRegexpPlus, flags);
186 re->AllocSub(1);
187 re->sub()[0] = sub;
188 return re;
189 }
190
Star(Regexp * sub,ParseFlags flags)191 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
192 if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
193 return sub;
194 Regexp* re = new Regexp(kRegexpStar, flags);
195 re->AllocSub(1);
196 re->sub()[0] = sub;
197 return re;
198 }
199
Quest(Regexp * sub,ParseFlags flags)200 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
201 if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
202 return sub;
203 Regexp* re = new Regexp(kRegexpQuest, flags);
204 re->AllocSub(1);
205 re->sub()[0] = sub;
206 return re;
207 }
208
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)209 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
210 ParseFlags flags, bool can_factor) {
211 if (nsub == 1)
212 return sub[0];
213
214 Regexp** subcopy = NULL;
215 if (op == kRegexpAlternate && can_factor) {
216 // Going to edit sub; make a copy so we don't step on caller.
217 subcopy = new Regexp*[nsub];
218 memmove(subcopy, sub, nsub * sizeof sub[0]);
219 sub = subcopy;
220 nsub = FactorAlternation(sub, nsub, flags);
221 if (nsub == 1) {
222 Regexp* re = sub[0];
223 delete[] subcopy;
224 return re;
225 }
226 }
227
228 if (nsub > kMaxNsub) {
229 // Too many subexpressions to fit in a single Regexp.
230 // Make a two-level tree. Two levels gets us to 65535^2.
231 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
232 Regexp* re = new Regexp(op, flags);
233 re->AllocSub(nbigsub);
234 Regexp** subs = re->sub();
235 for (int i = 0; i < nbigsub - 1; i++)
236 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
237 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
238 nsub - (nbigsub-1)*kMaxNsub, flags,
239 false);
240 delete[] subcopy;
241 return re;
242 }
243
244 Regexp* re = new Regexp(op, flags);
245 re->AllocSub(nsub);
246 Regexp** subs = re->sub();
247 for (int i = 0; i < nsub; i++)
248 subs[i] = sub[i];
249
250 delete[] subcopy;
251 return re;
252 }
253
Concat(Regexp ** sub,int nsub,ParseFlags flags)254 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
255 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
256 }
257
Alternate(Regexp ** sub,int nsub,ParseFlags flags)258 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
259 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
260 }
261
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)262 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
263 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
264 }
265
Capture(Regexp * sub,ParseFlags flags,int cap)266 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
267 Regexp* re = new Regexp(kRegexpCapture, flags);
268 re->AllocSub(1);
269 re->sub()[0] = sub;
270 re->cap_ = cap;
271 return re;
272 }
273
Repeat(Regexp * sub,ParseFlags flags,int min,int max)274 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
275 Regexp* re = new Regexp(kRegexpRepeat, flags);
276 re->AllocSub(1);
277 re->sub()[0] = sub;
278 re->min_ = min;
279 re->max_ = max;
280 return re;
281 }
282
NewLiteral(Rune rune,ParseFlags flags)283 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
284 Regexp* re = new Regexp(kRegexpLiteral, flags);
285 re->rune_ = rune;
286 return re;
287 }
288
LiteralString(Rune * runes,int nrunes,ParseFlags flags)289 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
290 if (nrunes <= 0)
291 return new Regexp(kRegexpEmptyMatch, flags);
292 if (nrunes == 1)
293 return NewLiteral(runes[0], flags);
294 Regexp* re = new Regexp(kRegexpLiteralString, flags);
295 for (int i = 0; i < nrunes; i++)
296 re->AddRuneToString(runes[i]);
297 return re;
298 }
299
NewCharClass(CharClass * cc,ParseFlags flags)300 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
301 Regexp* re = new Regexp(kRegexpCharClass, flags);
302 re->cc_ = cc;
303 return re;
304 }
305
306 // Swaps this and that in place.
Swap(Regexp * that)307 void Regexp::Swap(Regexp* that) {
308 // Can use memmove because Regexp is just a struct (no vtable).
309 char tmp[sizeof *this];
310 memmove(tmp, this, sizeof tmp);
311 memmove(this, that, sizeof tmp);
312 memmove(that, tmp, sizeof tmp);
313 }
314
315 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)316 static bool TopEqual(Regexp* a, Regexp* b) {
317 if (a->op() != b->op())
318 return false;
319
320 switch (a->op()) {
321 case kRegexpNoMatch:
322 case kRegexpEmptyMatch:
323 case kRegexpAnyChar:
324 case kRegexpAnyByte:
325 case kRegexpBeginLine:
326 case kRegexpEndLine:
327 case kRegexpWordBoundary:
328 case kRegexpNoWordBoundary:
329 case kRegexpBeginText:
330 return true;
331
332 case kRegexpEndText:
333 // The parse flags remember whether it's \z or (?-m:$),
334 // which matters when testing against PCRE.
335 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
336
337 case kRegexpLiteral:
338 return a->rune() == b->rune() &&
339 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
340
341 case kRegexpLiteralString:
342 return a->nrunes() == b->nrunes() &&
343 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
344 memcmp(a->runes(), b->runes(),
345 a->nrunes() * sizeof a->runes()[0]) == 0;
346
347 case kRegexpAlternate:
348 case kRegexpConcat:
349 return a->nsub() == b->nsub();
350
351 case kRegexpStar:
352 case kRegexpPlus:
353 case kRegexpQuest:
354 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
355
356 case kRegexpRepeat:
357 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
358 a->min() == b->min() &&
359 a->max() == b->max();
360
361 case kRegexpCapture:
362 return a->cap() == b->cap() && a->name() == b->name();
363
364 case kRegexpHaveMatch:
365 return a->match_id() == b->match_id();
366
367 case kRegexpCharClass: {
368 CharClass* acc = a->cc();
369 CharClass* bcc = b->cc();
370 return acc->size() == bcc->size() &&
371 acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
372 memcmp(acc->begin(), bcc->begin(),
373 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
374 }
375 }
376
377 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
378 return 0;
379 }
380
Equal(Regexp * a,Regexp * b)381 bool Regexp::Equal(Regexp* a, Regexp* b) {
382 if (a == NULL || b == NULL)
383 return a == b;
384
385 if (!TopEqual(a, b))
386 return false;
387
388 // Fast path:
389 // return without allocating vector if there are no subregexps.
390 switch (a->op()) {
391 case kRegexpAlternate:
392 case kRegexpConcat:
393 case kRegexpStar:
394 case kRegexpPlus:
395 case kRegexpQuest:
396 case kRegexpRepeat:
397 case kRegexpCapture:
398 break;
399
400 default:
401 return true;
402 }
403
404 // Committed to doing real work.
405 // The stack (vector) has pairs of regexps waiting to
406 // be compared. The regexps are only equal if
407 // all the pairs end up being equal.
408 vector<Regexp*> stk;
409
410 for (;;) {
411 // Invariant: TopEqual(a, b) == true.
412 Regexp* a2;
413 Regexp* b2;
414 switch (a->op()) {
415 default:
416 break;
417 case kRegexpAlternate:
418 case kRegexpConcat:
419 for (int i = 0; i < a->nsub(); i++) {
420 a2 = a->sub()[i];
421 b2 = b->sub()[i];
422 if (!TopEqual(a2, b2))
423 return false;
424 stk.push_back(a2);
425 stk.push_back(b2);
426 }
427 break;
428
429 case kRegexpStar:
430 case kRegexpPlus:
431 case kRegexpQuest:
432 case kRegexpRepeat:
433 case kRegexpCapture:
434 a2 = a->sub()[0];
435 b2 = b->sub()[0];
436 if (!TopEqual(a2, b2))
437 return false;
438 // Really:
439 // stk.push_back(a2);
440 // stk.push_back(b2);
441 // break;
442 // but faster to assign directly and loop.
443 a = a2;
444 b = b2;
445 continue;
446 }
447
448 int n = stk.size();
449 if (n == 0)
450 break;
451
452 a = stk[n-2];
453 b = stk[n-1];
454 stk.resize(n-2);
455 }
456
457 return true;
458 }
459
460 // Keep in sync with enum RegexpStatusCode in regexp.h
461 static const char *kErrorStrings[] = {
462 "no error",
463 "unexpected error",
464 "invalid escape sequence",
465 "invalid character class",
466 "invalid character class range",
467 "missing ]",
468 "missing )",
469 "trailing \\",
470 "no argument for repetition operator",
471 "invalid repetition size",
472 "bad repetition operator",
473 "invalid perl operator",
474 "invalid UTF-8",
475 "invalid named capture group",
476 };
477
CodeText(enum RegexpStatusCode code)478 string RegexpStatus::CodeText(enum RegexpStatusCode code) {
479 if (code < 0 || code >= arraysize(kErrorStrings))
480 code = kRegexpInternalError;
481 return kErrorStrings[code];
482 }
483
Text() const484 string RegexpStatus::Text() const {
485 if (error_arg_.empty())
486 return CodeText(code_);
487 string s;
488 s.append(CodeText(code_));
489 s.append(": ");
490 s.append(error_arg_.data(), error_arg_.size());
491 return s;
492 }
493
Copy(const RegexpStatus & status)494 void RegexpStatus::Copy(const RegexpStatus& status) {
495 code_ = status.code_;
496 error_arg_ = status.error_arg_;
497 }
498
499 typedef int Ignored; // Walker<void> doesn't exist
500
501 // Walker subclass to count capturing parens in regexp.
502 class NumCapturesWalker : public Regexp::Walker<Ignored> {
503 public:
NumCapturesWalker()504 NumCapturesWalker() : ncapture_(0) {}
ncapture()505 int ncapture() { return ncapture_; }
506
PreVisit(Regexp * re,Ignored ignored,bool * stop)507 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
508 if (re->op() == kRegexpCapture)
509 ncapture_++;
510 return ignored;
511 }
ShortVisit(Regexp * re,Ignored ignored)512 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
513 // Should never be called: we use Walk not WalkExponential.
514 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
515 return ignored;
516 }
517
518 private:
519 int ncapture_;
520 DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
521 };
522
NumCaptures()523 int Regexp::NumCaptures() {
524 NumCapturesWalker w;
525 w.Walk(this, 0);
526 return w.ncapture();
527 }
528
529 // Walker class to build map of named capture groups and their indices.
530 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
531 public:
NamedCapturesWalker()532 NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()533 ~NamedCapturesWalker() { delete map_; }
534
TakeMap()535 map<string, int>* TakeMap() {
536 map<string, int>* m = map_;
537 map_ = NULL;
538 return m;
539 }
540
PreVisit(Regexp * re,Ignored ignored,bool * stop)541 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
542 if (re->op() == kRegexpCapture && re->name() != NULL) {
543 // Allocate map once we find a name.
544 if (map_ == NULL)
545 map_ = new map<string, int>;
546
547 // Record first occurrence of each name.
548 // (The rule is that if you have the same name
549 // multiple times, only the leftmost one counts.)
550 if (map_->find(*re->name()) == map_->end())
551 (*map_)[*re->name()] = re->cap();
552 }
553 return ignored;
554 }
555
ShortVisit(Regexp * re,Ignored ignored)556 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
557 // Should never be called: we use Walk not WalkExponential.
558 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
559 return ignored;
560 }
561
562 private:
563 map<string, int>* map_;
564 DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
565 };
566
NamedCaptures()567 map<string, int>* Regexp::NamedCaptures() {
568 NamedCapturesWalker w;
569 w.Walk(this, 0);
570 return w.TakeMap();
571 }
572
573 // Walker class to build map from capture group indices to their names.
574 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
575 public:
CaptureNamesWalker()576 CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()577 ~CaptureNamesWalker() { delete map_; }
578
TakeMap()579 map<int, string>* TakeMap() {
580 map<int, string>* m = map_;
581 map_ = NULL;
582 return m;
583 }
584
PreVisit(Regexp * re,Ignored ignored,bool * stop)585 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
586 if (re->op() == kRegexpCapture && re->name() != NULL) {
587 // Allocate map once we find a name.
588 if (map_ == NULL)
589 map_ = new map<int, string>;
590
591 (*map_)[re->cap()] = *re->name();
592 }
593 return ignored;
594 }
595
ShortVisit(Regexp * re,Ignored ignored)596 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
597 // Should never be called: we use Walk not WalkExponential.
598 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
599 return ignored;
600 }
601
602 private:
603 map<int, string>* map_;
604 DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
605 };
606
CaptureNames()607 map<int, string>* Regexp::CaptureNames() {
608 CaptureNamesWalker w;
609 w.Walk(this, 0);
610 return w.TakeMap();
611 }
612
613 // Determines whether regexp matches must be anchored
614 // with a fixed string prefix. If so, returns the prefix and
615 // the regexp that remains after the prefix. The prefix might
616 // be ASCII case-insensitive.
RequiredPrefix(string * prefix,bool * foldcase,Regexp ** suffix)617 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
618 // No need for a walker: the regexp must be of the form
619 // 1. some number of ^ anchors
620 // 2. a literal char or string
621 // 3. the rest
622 prefix->clear();
623 *foldcase = false;
624 *suffix = NULL;
625 if (op_ != kRegexpConcat)
626 return false;
627
628 // Some number of anchors, then a literal or concatenation.
629 int i = 0;
630 Regexp** sub = this->sub();
631 while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
632 i++;
633 if (i == 0 || i >= nsub_)
634 return false;
635
636 Regexp* re = sub[i];
637 switch (re->op_) {
638 default:
639 return false;
640
641 case kRegexpLiteralString:
642 // Convert to string in proper encoding.
643 if (re->parse_flags() & Latin1) {
644 prefix->resize(re->nrunes_);
645 for (int j = 0; j < re->nrunes_; j++)
646 (*prefix)[j] = re->runes_[j];
647 } else {
648 // Convert to UTF-8 in place.
649 // Assume worst-case space and then trim.
650 prefix->resize(re->nrunes_ * UTFmax);
651 char *p = &(*prefix)[0];
652 for (int j = 0; j < re->nrunes_; j++) {
653 Rune r = re->runes_[j];
654 if (r < Runeself)
655 *p++ = r;
656 else
657 p += runetochar(p, &r);
658 }
659 prefix->resize(p - &(*prefix)[0]);
660 }
661 break;
662
663 case kRegexpLiteral:
664 if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
665 prefix->append(1, re->rune_);
666 } else {
667 char buf[UTFmax];
668 prefix->append(buf, runetochar(buf, &re->rune_));
669 }
670 break;
671 }
672 *foldcase = (sub[i]->parse_flags() & FoldCase);
673 i++;
674
675 // The rest.
676 if (i < nsub_) {
677 for (int j = i; j < nsub_; j++)
678 sub[j]->Incref();
679 re = Concat(sub + i, nsub_ - i, parse_flags());
680 } else {
681 re = new Regexp(kRegexpEmptyMatch, parse_flags());
682 }
683 *suffix = re;
684 return true;
685 }
686
687 // Character class builder is a balanced binary tree (STL set)
688 // containing non-overlapping, non-abutting RuneRanges.
689 // The less-than operator used in the tree treats two
690 // ranges as equal if they overlap at all, so that
691 // lookups for a particular Rune are possible.
692
CharClassBuilder()693 CharClassBuilder::CharClassBuilder() {
694 nrunes_ = 0;
695 upper_ = 0;
696 lower_ = 0;
697 }
698
699 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)700 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
701 if (hi < lo)
702 return false;
703
704 if (lo <= 'z' && hi >= 'A') {
705 // Overlaps some alpha, maybe not all.
706 // Update bitmaps telling which ASCII letters are in the set.
707 Rune lo1 = max<Rune>(lo, 'A');
708 Rune hi1 = min<Rune>(hi, 'Z');
709 if (lo1 <= hi1)
710 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
711
712 lo1 = max<Rune>(lo, 'a');
713 hi1 = min<Rune>(hi, 'z');
714 if (lo1 <= hi1)
715 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
716 }
717
718 { // Check whether lo, hi is already in the class.
719 iterator it = ranges_.find(RuneRange(lo, lo));
720 if (it != end() && it->lo <= lo && hi <= it->hi)
721 return false;
722 }
723
724 // Look for a range abutting lo on the left.
725 // If it exists, take it out and increase our range.
726 if (lo > 0) {
727 iterator it = ranges_.find(RuneRange(lo-1, lo-1));
728 if (it != end()) {
729 lo = it->lo;
730 if (it->hi > hi)
731 hi = it->hi;
732 nrunes_ -= it->hi - it->lo + 1;
733 ranges_.erase(it);
734 }
735 }
736
737 // Look for a range abutting hi on the right.
738 // If it exists, take it out and increase our range.
739 if (hi < Runemax) {
740 iterator it = ranges_.find(RuneRange(hi+1, hi+1));
741 if (it != end()) {
742 hi = it->hi;
743 nrunes_ -= it->hi - it->lo + 1;
744 ranges_.erase(it);
745 }
746 }
747
748 // Look for ranges between lo and hi. Take them out.
749 // This is only safe because the set has no overlapping ranges.
750 // We've already removed any ranges abutting lo and hi, so
751 // any that overlap [lo, hi] must be contained within it.
752 for (;;) {
753 iterator it = ranges_.find(RuneRange(lo, hi));
754 if (it == end())
755 break;
756 nrunes_ -= it->hi - it->lo + 1;
757 ranges_.erase(it);
758 }
759
760 // Finally, add [lo, hi].
761 nrunes_ += hi - lo + 1;
762 ranges_.insert(RuneRange(lo, hi));
763 return true;
764 }
765
AddCharClass(CharClassBuilder * cc)766 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
767 for (iterator it = cc->begin(); it != cc->end(); ++it)
768 AddRange(it->lo, it->hi);
769 }
770
Contains(Rune r)771 bool CharClassBuilder::Contains(Rune r) {
772 return ranges_.find(RuneRange(r, r)) != end();
773 }
774
775 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()776 bool CharClassBuilder::FoldsASCII() {
777 return ((upper_ ^ lower_) & AlphaMask) == 0;
778 }
779
Copy()780 CharClassBuilder* CharClassBuilder::Copy() {
781 CharClassBuilder* cc = new CharClassBuilder;
782 for (iterator it = begin(); it != end(); ++it)
783 cc->ranges_.insert(RuneRange(it->lo, it->hi));
784 cc->upper_ = upper_;
785 cc->lower_ = lower_;
786 cc->nrunes_ = nrunes_;
787 return cc;
788 }
789
790
791
RemoveAbove(Rune r)792 void CharClassBuilder::RemoveAbove(Rune r) {
793 if (r >= Runemax)
794 return;
795
796 if (r < 'z') {
797 if (r < 'a')
798 lower_ = 0;
799 else
800 lower_ &= AlphaMask >> ('z' - r);
801 }
802
803 if (r < 'Z') {
804 if (r < 'A')
805 upper_ = 0;
806 else
807 upper_ &= AlphaMask >> ('Z' - r);
808 }
809
810 for (;;) {
811
812 iterator it = ranges_.find(RuneRange(r + 1, Runemax));
813 if (it == end())
814 break;
815 RuneRange rr = *it;
816 ranges_.erase(it);
817 nrunes_ -= rr.hi - rr.lo + 1;
818 if (rr.lo <= r) {
819 rr.hi = r;
820 ranges_.insert(rr);
821 nrunes_ += rr.hi - rr.lo + 1;
822 }
823 }
824 }
825
Negate()826 void CharClassBuilder::Negate() {
827 // Build up negation and then copy in.
828 // Could edit ranges in place, but C++ won't let me.
829 vector<RuneRange> v;
830 v.reserve(ranges_.size() + 1);
831
832 // In negation, first range begins at 0, unless
833 // the current class begins at 0.
834 iterator it = begin();
835 if (it == end()) {
836 v.push_back(RuneRange(0, Runemax));
837 } else {
838 int nextlo = 0;
839 if (it->lo == 0) {
840 nextlo = it->hi + 1;
841 ++it;
842 }
843 for (; it != end(); ++it) {
844 v.push_back(RuneRange(nextlo, it->lo - 1));
845 nextlo = it->hi + 1;
846 }
847 if (nextlo <= Runemax)
848 v.push_back(RuneRange(nextlo, Runemax));
849 }
850
851 ranges_.clear();
852 for (int i = 0; i < v.size(); i++)
853 ranges_.insert(v[i]);
854
855 upper_ = AlphaMask & ~upper_;
856 lower_ = AlphaMask & ~lower_;
857 nrunes_ = Runemax+1 - nrunes_;
858 }
859
860 // Character class is a sorted list of ranges.
861 // The ranges are allocated in the same block as the header,
862 // necessitating a special allocator and Delete method.
863
New(int maxranges)864 CharClass* CharClass::New(int maxranges) {
865 CharClass* cc;
866 uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
867 cc = reinterpret_cast<CharClass*>(data);
868 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
869 cc->nranges_ = 0;
870 cc->folds_ascii_ = false;
871 cc->nrunes_ = 0;
872 return cc;
873 }
874
Delete()875 void CharClass::Delete() {
876 if (this == NULL)
877 return;
878 uint8 *data = reinterpret_cast<uint8*>(this);
879 delete[] data;
880 }
881
Negate()882 CharClass* CharClass::Negate() {
883 CharClass* cc = CharClass::New(nranges_+1);
884 cc->folds_ascii_ = folds_ascii_;
885 cc->nrunes_ = Runemax + 1 - nrunes_;
886 int n = 0;
887 int nextlo = 0;
888 for (CharClass::iterator it = begin(); it != end(); ++it) {
889 if (it->lo == nextlo) {
890 nextlo = it->hi + 1;
891 } else {
892 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
893 nextlo = it->hi + 1;
894 }
895 }
896 if (nextlo <= Runemax)
897 cc->ranges_[n++] = RuneRange(nextlo, Runemax);
898 cc->nranges_ = n;
899 return cc;
900 }
901
Contains(Rune r)902 bool CharClass::Contains(Rune r) {
903 RuneRange* rr = ranges_;
904 int n = nranges_;
905 while (n > 0) {
906 int m = n/2;
907 if (rr[m].hi < r) {
908 rr += m+1;
909 n -= m+1;
910 } else if (r < rr[m].lo) {
911 n = m;
912 } else { // rr[m].lo <= r && r <= rr[m].hi
913 return true;
914 }
915 }
916 return false;
917 }
918
GetCharClass()919 CharClass* CharClassBuilder::GetCharClass() {
920 CharClass* cc = CharClass::New(ranges_.size());
921 int n = 0;
922 for (iterator it = begin(); it != end(); ++it)
923 cc->ranges_[n++] = *it;
924 cc->nranges_ = n;
925 DCHECK_LE(n, ranges_.size());
926 cc->nrunes_ = nrunes_;
927 cc->folds_ascii_ = FoldsASCII();
928 return cc;
929 }
930
931 } // namespace re2
932