1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "util/util.h"
6 #include "re2/prefilter.h"
7 #include "re2/re2.h"
8 #include "re2/unicode_casefold.h"
9 #include "re2/walker-inl.h"
10
11 namespace re2 {
12
13 static const int Trace = false;
14
15 typedef set<string>::iterator SSIter;
16 typedef set<string>::const_iterator ConstSSIter;
17
18 static int alloc_id = 100000; // Used for debugging.
19 // Initializes a Prefilter, allocating subs_ as necessary.
Prefilter(Op op)20 Prefilter::Prefilter(Op op) {
21 op_ = op;
22 subs_ = NULL;
23 if (op_ == AND || op_ == OR)
24 subs_ = new vector<Prefilter*>;
25
26 alloc_id_ = alloc_id++;
27 VLOG(10) << "alloc_id: " << alloc_id_;
28 }
29
30 // Destroys a Prefilter.
~Prefilter()31 Prefilter::~Prefilter() {
32 VLOG(10) << "Deleted: " << alloc_id_;
33 if (subs_) {
34 for (int i = 0; i < subs_->size(); i++)
35 delete (*subs_)[i];
36 delete subs_;
37 subs_ = NULL;
38 }
39 }
40
41 // Simplify if the node is an empty Or or And.
Simplify()42 Prefilter* Prefilter::Simplify() {
43 if (op_ != AND && op_ != OR) {
44 return this;
45 }
46
47 // Nothing left in the AND/OR.
48 if (subs_->size() == 0) {
49 if (op_ == AND)
50 op_ = ALL; // AND of nothing is true
51 else
52 op_ = NONE; // OR of nothing is false
53
54 return this;
55 }
56
57 // Just one subnode: throw away wrapper.
58 if (subs_->size() == 1) {
59 Prefilter* a = (*subs_)[0];
60 subs_->clear();
61 delete this;
62 return a->Simplify();
63 }
64
65 return this;
66 }
67
68 // Combines two Prefilters together to create an "op" (AND or OR).
69 // The passed Prefilters will be part of the returned Prefilter or deleted.
70 // Does lots of work to avoid creating unnecessarily complicated structures.
AndOr(Op op,Prefilter * a,Prefilter * b)71 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
72 // If a, b can be rewritten as op, do so.
73 a = a->Simplify();
74 b = b->Simplify();
75
76 // Canonicalize: a->op <= b->op.
77 if (a->op() > b->op()) {
78 Prefilter* t = a;
79 a = b;
80 b = t;
81 }
82
83 // Trivial cases.
84 // ALL AND b = b
85 // NONE OR b = b
86 // ALL OR b = ALL
87 // NONE AND b = NONE
88 // Don't need to look at b, because of canonicalization above.
89 // ALL and NONE are smallest opcodes.
90 if (a->op() == ALL || a->op() == NONE) {
91 if ((a->op() == ALL && op == AND) ||
92 (a->op() == NONE && op == OR)) {
93 delete a;
94 return b;
95 } else {
96 delete b;
97 return a;
98 }
99 }
100
101 // If a and b match op, merge their contents.
102 if (a->op() == op && b->op() == op) {
103 for (int i = 0; i < b->subs()->size(); i++) {
104 Prefilter* bb = (*b->subs())[i];
105 a->subs()->push_back(bb);
106 }
107 b->subs()->clear();
108 delete b;
109 return a;
110 }
111
112 // If a already has the same op as the op that is under construction
113 // add in b (similarly if b already has the same op, add in a).
114 if (b->op() == op) {
115 Prefilter* t = a;
116 a = b;
117 b = t;
118 }
119 if (a->op() == op) {
120 a->subs()->push_back(b);
121 return a;
122 }
123
124 // Otherwise just return the op.
125 Prefilter* c = new Prefilter(op);
126 c->subs()->push_back(a);
127 c->subs()->push_back(b);
128 return c;
129 }
130
And(Prefilter * a,Prefilter * b)131 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
132 return AndOr(AND, a, b);
133 }
134
Or(Prefilter * a,Prefilter * b)135 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
136 return AndOr(OR, a, b);
137 }
138
SimplifyStringSet(set<string> * ss)139 static void SimplifyStringSet(set<string> *ss) {
140 // Now make sure that the strings aren't redundant. For example, if
141 // we know "ab" is a required string, then it doesn't help at all to
142 // know that "abc" is also a required string, so delete "abc". This
143 // is because, when we are performing a string search to filter
144 // regexps, matching ab will already allow this regexp to be a
145 // candidate for match, so further matching abc is redundant.
146
147 for (SSIter i = ss->begin(); i != ss->end(); ++i) {
148 SSIter j = i;
149 ++j;
150 while (j != ss->end()) {
151 // Increment j early so that we can erase the element it points to.
152 SSIter old_j = j;
153 ++j;
154 if (old_j->find(*i) != string::npos)
155 ss->erase(old_j);
156 }
157 }
158 }
159
OrStrings(set<string> * ss)160 Prefilter* Prefilter::OrStrings(set<string>* ss) {
161 SimplifyStringSet(ss);
162 Prefilter* or_prefilter = NULL;
163 if (!ss->empty()) {
164 or_prefilter = new Prefilter(NONE);
165 for (SSIter i = ss->begin(); i != ss->end(); ++i)
166 or_prefilter = Or(or_prefilter, FromString(*i));
167 }
168 return or_prefilter;
169 }
170
ToLowerRune(Rune r)171 static Rune ToLowerRune(Rune r) {
172 if (r < Runeself) {
173 if ('A' <= r && r <= 'Z')
174 r += 'a' - 'A';
175 return r;
176 }
177
178 CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
179 if (f == NULL || r < f->lo)
180 return r;
181 return ApplyFold(f, r);
182 }
183
ToLowerRuneLatin1(Rune r)184 static Rune ToLowerRuneLatin1(Rune r) {
185 if ('A' <= r && r <= 'Z')
186 r += 'a' - 'A';
187 return r;
188 }
189
FromString(const string & str)190 Prefilter* Prefilter::FromString(const string& str) {
191 Prefilter* m = new Prefilter(Prefilter::ATOM);
192 m->atom_ = str;
193 return m;
194 }
195
196 // Information about a regexp used during computation of Prefilter.
197 // Can be thought of as information about the set of strings matching
198 // the given regular expression.
199 class Prefilter::Info {
200 public:
201 Info();
202 ~Info();
203
204 // More constructors. They delete their Info* arguments.
205 static Info* Alt(Info* a, Info* b);
206 static Info* Concat(Info* a, Info* b);
207 static Info* And(Info* a, Info* b);
208 static Info* Star(Info* a);
209 static Info* Plus(Info* a);
210 static Info* Quest(Info* a);
211 static Info* EmptyString();
212 static Info* NoMatch();
213 static Info* AnyChar();
214 static Info* CClass(CharClass* cc, bool latin1);
215 static Info* Literal(Rune r);
216 static Info* LiteralLatin1(Rune r);
217 static Info* AnyMatch();
218
219 // Format Info as a string.
220 string ToString();
221
222 // Caller takes ownership of the Prefilter.
223 Prefilter* TakeMatch();
224
exact()225 set<string>& exact() { return exact_; }
226
is_exact() const227 bool is_exact() const { return is_exact_; }
228
229 class Walker;
230
231 private:
232 set<string> exact_;
233
234 // When is_exact_ is true, the strings that match
235 // are placed in exact_. When it is no longer an exact
236 // set of strings that match this RE, then is_exact_
237 // is false and the match_ contains the required match
238 // criteria.
239 bool is_exact_;
240
241 // Accumulated Prefilter query that any
242 // match for this regexp is guaranteed to match.
243 Prefilter* match_;
244 };
245
246
Info()247 Prefilter::Info::Info()
248 : is_exact_(false),
249 match_(NULL) {
250 }
251
~Info()252 Prefilter::Info::~Info() {
253 delete match_;
254 }
255
TakeMatch()256 Prefilter* Prefilter::Info::TakeMatch() {
257 if (is_exact_) {
258 match_ = Prefilter::OrStrings(&exact_);
259 is_exact_ = false;
260 }
261 Prefilter* m = match_;
262 match_ = NULL;
263 return m;
264 }
265
266 // Format a Info in string form.
ToString()267 string Prefilter::Info::ToString() {
268 if (this == NULL) {
269 // Sometimes when iterating on children of a node,
270 // some children might have NULL Info. Adding
271 // the check here for NULL to take care of cases where
272 // the caller is not checking.
273 return "";
274 }
275
276 if (is_exact_) {
277 int n = 0;
278 string s;
279 for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
280 if (n++ > 0)
281 s += ",";
282 s += *i;
283 }
284 return s;
285 }
286
287 if (match_)
288 return match_->DebugString();
289
290 return "";
291 }
292
293 // Add the strings from src to dst.
CopyIn(const set<string> & src,set<string> * dst)294 static void CopyIn(const set<string>& src, set<string>* dst) {
295 for (ConstSSIter i = src.begin(); i != src.end(); ++i)
296 dst->insert(*i);
297 }
298
299 // Add the cross-product of a and b to dst.
300 // (For each string i in a and j in b, add i+j.)
CrossProduct(const set<string> & a,const set<string> & b,set<string> * dst)301 static void CrossProduct(const set<string>& a,
302 const set<string>& b,
303 set<string>* dst) {
304 for (ConstSSIter i = a.begin(); i != a.end(); ++i)
305 for (ConstSSIter j = b.begin(); j != b.end(); ++j)
306 dst->insert(*i + *j);
307 }
308
309 // Concats a and b. Requires that both are exact sets.
310 // Forms an exact set that is a crossproduct of a and b.
Concat(Info * a,Info * b)311 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
312 if (a == NULL)
313 return b;
314 DCHECK(a->is_exact_);
315 DCHECK(b && b->is_exact_);
316 Info *ab = new Info();
317
318 CrossProduct(a->exact_, b->exact_, &ab->exact_);
319 ab->is_exact_ = true;
320
321 delete a;
322 delete b;
323 return ab;
324 }
325
326 // Constructs an inexact Info for ab given a and b.
327 // Used only when a or b is not exact or when the
328 // exact cross product is likely to be too big.
And(Info * a,Info * b)329 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
330 if (a == NULL)
331 return b;
332 if (b == NULL)
333 return a;
334
335 Info *ab = new Info();
336
337 ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
338 ab->is_exact_ = false;
339 delete a;
340 delete b;
341 return ab;
342 }
343
344 // Constructs Info for a|b given a and b.
Alt(Info * a,Info * b)345 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
346 Info *ab = new Info();
347
348 if (a->is_exact_ && b->is_exact_) {
349 CopyIn(a->exact_, &ab->exact_);
350 CopyIn(b->exact_, &ab->exact_);
351 ab->is_exact_ = true;
352 } else {
353 // Either a or b has is_exact_ = false. If the other
354 // one has is_exact_ = true, we move it to match_ and
355 // then create a OR of a,b. The resulting Info has
356 // is_exact_ = false.
357 ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
358 ab->is_exact_ = false;
359 }
360
361 delete a;
362 delete b;
363 return ab;
364 }
365
366 // Constructs Info for a? given a.
Quest(Info * a)367 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
368 Info *ab = new Info();
369
370 ab->is_exact_ = false;
371 ab->match_ = new Prefilter(ALL);
372 delete a;
373 return ab;
374 }
375
376 // Constructs Info for a* given a.
377 // Same as a? -- not much to do.
Star(Info * a)378 Prefilter::Info* Prefilter::Info::Star(Info *a) {
379 return Quest(a);
380 }
381
382 // Constructs Info for a+ given a. If a was exact set, it isn't
383 // anymore.
Plus(Info * a)384 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
385 Info *ab = new Info();
386
387 ab->match_ = a->TakeMatch();
388 ab->is_exact_ = false;
389
390 delete a;
391 return ab;
392 }
393
RuneToString(Rune r)394 static string RuneToString(Rune r) {
395 char buf[UTFmax];
396 int n = runetochar(buf, &r);
397 return string(buf, n);
398 }
399
RuneToStringLatin1(Rune r)400 static string RuneToStringLatin1(Rune r) {
401 char c = r & 0xff;
402 return string(&c, 1);
403 }
404
405 // Constructs Info for literal rune.
Literal(Rune r)406 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
407 Info* info = new Info();
408 info->exact_.insert(RuneToString(ToLowerRune(r)));
409 info->is_exact_ = true;
410 return info;
411 }
412
413 // Constructs Info for literal rune for Latin1 encoded string.
LiteralLatin1(Rune r)414 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
415 Info* info = new Info();
416 info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
417 info->is_exact_ = true;
418 return info;
419 }
420
421 // Constructs Info for dot (any character).
AnyChar()422 Prefilter::Info* Prefilter::Info::AnyChar() {
423 Prefilter::Info* info = new Prefilter::Info();
424 info->match_ = new Prefilter(ALL);
425 return info;
426 }
427
428 // Constructs Prefilter::Info for no possible match.
NoMatch()429 Prefilter::Info* Prefilter::Info::NoMatch() {
430 Prefilter::Info* info = new Prefilter::Info();
431 info->match_ = new Prefilter(NONE);
432 return info;
433 }
434
435 // Constructs Prefilter::Info for any possible match.
436 // This Prefilter::Info is valid for any regular expression,
437 // since it makes no assertions whatsoever about the
438 // strings being matched.
AnyMatch()439 Prefilter::Info* Prefilter::Info::AnyMatch() {
440 Prefilter::Info *info = new Prefilter::Info();
441 info->match_ = new Prefilter(ALL);
442 return info;
443 }
444
445 // Constructs Prefilter::Info for just the empty string.
EmptyString()446 Prefilter::Info* Prefilter::Info::EmptyString() {
447 Prefilter::Info* info = new Prefilter::Info();
448 info->is_exact_ = true;
449 info->exact_.insert("");
450 return info;
451 }
452
453 // Constructs Prefilter::Info for a character class.
454 typedef CharClass::iterator CCIter;
CClass(CharClass * cc,bool latin1)455 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
456 bool latin1) {
457 if (Trace) {
458 VLOG(0) << "CharClassInfo:";
459 for (CCIter i = cc->begin(); i != cc->end(); ++i)
460 VLOG(0) << " " << i->lo << "-" << i->hi;
461 }
462
463 // If the class is too large, it's okay to overestimate.
464 if (cc->size() > 10)
465 return AnyChar();
466
467 Prefilter::Info *a = new Prefilter::Info();
468 for (CCIter i = cc->begin(); i != cc->end(); ++i)
469 for (Rune r = i->lo; r <= i->hi; r++) {
470 if (latin1) {
471 a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
472 } else {
473 a->exact_.insert(RuneToString(ToLowerRune(r)));
474 }
475 }
476
477
478 a->is_exact_ = true;
479
480 if (Trace) {
481 VLOG(0) << " = " << a->ToString();
482 }
483
484 return a;
485 }
486
487 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
488 public:
Walker(bool latin1)489 Walker(bool latin1) : latin1_(latin1) {}
490
491 virtual Info* PostVisit(
492 Regexp* re, Info* parent_arg,
493 Info* pre_arg,
494 Info** child_args, int nchild_args);
495
496 virtual Info* ShortVisit(
497 Regexp* re,
498 Info* parent_arg);
499
latin1()500 bool latin1() { return latin1_; }
501 private:
502 bool latin1_;
503 DISALLOW_EVIL_CONSTRUCTORS(Walker);
504 };
505
BuildInfo(Regexp * re)506 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
507 if (Trace) {
508 LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
509 }
510
511 bool latin1 = re->parse_flags() & Regexp::Latin1;
512 Prefilter::Info::Walker w(latin1);
513 Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
514
515 if (w.stopped_early()) {
516 delete info;
517 return NULL;
518 }
519
520 return info;
521 }
522
ShortVisit(Regexp * re,Prefilter::Info * parent_arg)523 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
524 Regexp* re, Prefilter::Info* parent_arg) {
525 return AnyMatch();
526 }
527
528 // Constructs the Prefilter::Info for the given regular expression.
529 // Assumes re is simplified.
PostVisit(Regexp * re,Prefilter::Info * parent_arg,Prefilter::Info * pre_arg,Prefilter::Info ** child_args,int nchild_args)530 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
531 Regexp* re, Prefilter::Info* parent_arg,
532 Prefilter::Info* pre_arg, Prefilter::Info** child_args,
533 int nchild_args) {
534 Prefilter::Info *info;
535 switch (re->op()) {
536 default:
537 case kRegexpRepeat:
538 LOG(DFATAL) << "Bad regexp op " << re->op();
539 info = EmptyString();
540 break;
541
542 case kRegexpNoMatch:
543 info = NoMatch();
544 break;
545
546 // These ops match the empty string:
547 case kRegexpEmptyMatch: // anywhere
548 case kRegexpBeginLine: // at beginning of line
549 case kRegexpEndLine: // at end of line
550 case kRegexpBeginText: // at beginning of text
551 case kRegexpEndText: // at end of text
552 case kRegexpWordBoundary: // at word boundary
553 case kRegexpNoWordBoundary: // not at word boundary
554 info = EmptyString();
555 break;
556
557 case kRegexpLiteral:
558 if (latin1()) {
559 info = LiteralLatin1(re->rune());
560 }
561 else {
562 info = Literal(re->rune());
563 }
564 break;
565
566 case kRegexpLiteralString:
567 if (re->nrunes() == 0) {
568 info = NoMatch();
569 break;
570 }
571 if (latin1()) {
572 info = LiteralLatin1(re->runes()[0]);
573 for (int i = 1; i < re->nrunes(); i++) {
574 info = Concat(info, LiteralLatin1(re->runes()[i]));
575 }
576 } else {
577 info = Literal(re->runes()[0]);
578 for (int i = 1; i < re->nrunes(); i++) {
579 info = Concat(info, Literal(re->runes()[i]));
580 }
581 }
582 break;
583
584 case kRegexpConcat: {
585 // Accumulate in info.
586 // Exact is concat of recent contiguous exact nodes.
587 info = NULL;
588 Info* exact = NULL;
589 for (int i = 0; i < nchild_args; i++) {
590 Info* ci = child_args[i]; // child info
591 if (!ci->is_exact() ||
592 (exact && ci->exact().size() * exact->exact().size() > 16)) {
593 // Exact run is over.
594 info = And(info, exact);
595 exact = NULL;
596 // Add this child's info.
597 info = And(info, ci);
598 } else {
599 // Append to exact run.
600 exact = Concat(exact, ci);
601 }
602 }
603 info = And(info, exact);
604 }
605 break;
606
607 case kRegexpAlternate:
608 info = child_args[0];
609 for (int i = 1; i < nchild_args; i++)
610 info = Alt(info, child_args[i]);
611 VLOG(10) << "Alt: " << info->ToString();
612 break;
613
614 case kRegexpStar:
615 info = Star(child_args[0]);
616 break;
617
618 case kRegexpQuest:
619 info = Quest(child_args[0]);
620 break;
621
622 case kRegexpPlus:
623 info = Plus(child_args[0]);
624 break;
625
626 case kRegexpAnyChar:
627 // Claim nothing, except that it's not empty.
628 info = AnyChar();
629 break;
630
631 case kRegexpCharClass:
632 info = CClass(re->cc(), latin1());
633 break;
634
635 case kRegexpCapture:
636 // These don't affect the set of matching strings.
637 info = child_args[0];
638 break;
639 }
640
641 if (Trace) {
642 VLOG(0) << "BuildInfo " << re->ToString()
643 << ": " << info->ToString();
644 }
645
646 return info;
647 }
648
649
FromRegexp(Regexp * re)650 Prefilter* Prefilter::FromRegexp(Regexp* re) {
651 if (re == NULL)
652 return NULL;
653
654 Regexp* simple = re->Simplify();
655 Prefilter::Info *info = BuildInfo(simple);
656
657 simple->Decref();
658 if (info == NULL)
659 return NULL;
660
661 Prefilter* m = info->TakeMatch();
662
663 delete info;
664 return m;
665 }
666
DebugString() const667 string Prefilter::DebugString() const {
668 if (this == NULL)
669 return "<nil>";
670
671 switch (op_) {
672 default:
673 LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
674 return StringPrintf("op%d", op_);
675 case NONE:
676 return "*no-matches*";
677 case ATOM:
678 return atom_;
679 case ALL:
680 return "";
681 case AND: {
682 string s = "";
683 for (int i = 0; i < subs_->size(); i++) {
684 if (i > 0)
685 s += " ";
686 s += (*subs_)[i]->DebugString();
687 }
688 return s;
689 }
690 case OR: {
691 string s = "(";
692 for (int i = 0; i < subs_->size(); i++) {
693 if (i > 0)
694 s += "|";
695 s += (*subs_)[i]->DebugString();
696 }
697 s += ")";
698 return s;
699 }
700 }
701 }
702
FromRE2(const RE2 * re2)703 Prefilter* Prefilter::FromRE2(const RE2* re2) {
704 if (re2 == NULL)
705 return NULL;
706
707 Regexp* regexp = re2->Regexp();
708 if (regexp == NULL)
709 return NULL;
710
711 return FromRegexp(regexp);
712 }
713
714
715 } // namespace re2
716