1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include "util/util.h"
6 #include "re2/prefilter.h"
7 #include "re2/re2.h"
8 #include "re2/unicode_casefold.h"
9 #include "re2/walker-inl.h"
10 
11 namespace re2 {
12 
13 static const int Trace = false;
14 
15 typedef set<string>::iterator SSIter;
16 typedef set<string>::const_iterator ConstSSIter;
17 
18 static int alloc_id = 100000;  // Used for debugging.
19 // Initializes a Prefilter, allocating subs_ as necessary.
Prefilter(Op op)20 Prefilter::Prefilter(Op op) {
21   op_ = op;
22   subs_ = NULL;
23   if (op_ == AND || op_ == OR)
24     subs_ = new vector<Prefilter*>;
25 
26   alloc_id_ = alloc_id++;
27   VLOG(10) << "alloc_id: " << alloc_id_;
28 }
29 
30 // Destroys a Prefilter.
~Prefilter()31 Prefilter::~Prefilter() {
32   VLOG(10) << "Deleted: " << alloc_id_;
33   if (subs_) {
34     for (int i = 0; i < subs_->size(); i++)
35       delete (*subs_)[i];
36     delete subs_;
37     subs_ = NULL;
38   }
39 }
40 
41 // Simplify if the node is an empty Or or And.
Simplify()42 Prefilter* Prefilter::Simplify() {
43   if (op_ != AND && op_ != OR) {
44     return this;
45   }
46 
47   // Nothing left in the AND/OR.
48   if (subs_->size() == 0) {
49     if (op_ == AND)
50       op_ = ALL;  // AND of nothing is true
51     else
52       op_ = NONE;  // OR of nothing is false
53 
54     return this;
55   }
56 
57   // Just one subnode: throw away wrapper.
58   if (subs_->size() == 1) {
59     Prefilter* a = (*subs_)[0];
60     subs_->clear();
61     delete this;
62     return a->Simplify();
63   }
64 
65   return this;
66 }
67 
68 // Combines two Prefilters together to create an "op" (AND or OR).
69 // The passed Prefilters will be part of the returned Prefilter or deleted.
70 // Does lots of work to avoid creating unnecessarily complicated structures.
AndOr(Op op,Prefilter * a,Prefilter * b)71 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
72   // If a, b can be rewritten as op, do so.
73   a = a->Simplify();
74   b = b->Simplify();
75 
76   // Canonicalize: a->op <= b->op.
77   if (a->op() > b->op()) {
78     Prefilter* t = a;
79     a = b;
80     b = t;
81   }
82 
83   // Trivial cases.
84   //    ALL AND b = b
85   //    NONE OR b = b
86   //    ALL OR b   = ALL
87   //    NONE AND b = NONE
88   // Don't need to look at b, because of canonicalization above.
89   // ALL and NONE are smallest opcodes.
90   if (a->op() == ALL || a->op() == NONE) {
91     if ((a->op() == ALL && op == AND) ||
92         (a->op() == NONE && op == OR)) {
93       delete a;
94       return b;
95     } else {
96       delete b;
97       return a;
98     }
99   }
100 
101   // If a and b match op, merge their contents.
102   if (a->op() == op && b->op() == op) {
103     for (int i = 0; i < b->subs()->size(); i++) {
104       Prefilter* bb = (*b->subs())[i];
105       a->subs()->push_back(bb);
106     }
107     b->subs()->clear();
108     delete b;
109     return a;
110   }
111 
112   // If a already has the same op as the op that is under construction
113   // add in b (similarly if b already has the same op, add in a).
114   if (b->op() == op) {
115     Prefilter* t = a;
116     a = b;
117     b = t;
118   }
119   if (a->op() == op) {
120     a->subs()->push_back(b);
121     return a;
122   }
123 
124   // Otherwise just return the op.
125   Prefilter* c = new Prefilter(op);
126   c->subs()->push_back(a);
127   c->subs()->push_back(b);
128   return c;
129 }
130 
And(Prefilter * a,Prefilter * b)131 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
132   return AndOr(AND, a, b);
133 }
134 
Or(Prefilter * a,Prefilter * b)135 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
136   return AndOr(OR, a, b);
137 }
138 
SimplifyStringSet(set<string> * ss)139 static void SimplifyStringSet(set<string> *ss) {
140   // Now make sure that the strings aren't redundant.  For example, if
141   // we know "ab" is a required string, then it doesn't help at all to
142   // know that "abc" is also a required string, so delete "abc". This
143   // is because, when we are performing a string search to filter
144   // regexps, matching ab will already allow this regexp to be a
145   // candidate for match, so further matching abc is redundant.
146 
147   for (SSIter i = ss->begin(); i != ss->end(); ++i) {
148     SSIter j = i;
149     ++j;
150     while (j != ss->end()) {
151       // Increment j early so that we can erase the element it points to.
152       SSIter old_j = j;
153       ++j;
154       if (old_j->find(*i) != string::npos)
155         ss->erase(old_j);
156     }
157   }
158 }
159 
OrStrings(set<string> * ss)160 Prefilter* Prefilter::OrStrings(set<string>* ss) {
161   SimplifyStringSet(ss);
162   Prefilter* or_prefilter = NULL;
163   if (!ss->empty()) {
164     or_prefilter = new Prefilter(NONE);
165     for (SSIter i = ss->begin(); i != ss->end(); ++i)
166       or_prefilter = Or(or_prefilter, FromString(*i));
167   }
168   return or_prefilter;
169 }
170 
ToLowerRune(Rune r)171 static Rune ToLowerRune(Rune r) {
172   if (r < Runeself) {
173     if ('A' <= r && r <= 'Z')
174       r += 'a' - 'A';
175     return r;
176   }
177 
178   CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
179   if (f == NULL || r < f->lo)
180     return r;
181   return ApplyFold(f, r);
182 }
183 
ToLowerRuneLatin1(Rune r)184 static Rune ToLowerRuneLatin1(Rune r) {
185   if ('A' <= r && r <= 'Z')
186     r += 'a' - 'A';
187   return r;
188 }
189 
FromString(const string & str)190 Prefilter* Prefilter::FromString(const string& str) {
191   Prefilter* m = new Prefilter(Prefilter::ATOM);
192   m->atom_ = str;
193   return m;
194 }
195 
196 // Information about a regexp used during computation of Prefilter.
197 // Can be thought of as information about the set of strings matching
198 // the given regular expression.
199 class Prefilter::Info {
200  public:
201   Info();
202   ~Info();
203 
204   // More constructors.  They delete their Info* arguments.
205   static Info* Alt(Info* a, Info* b);
206   static Info* Concat(Info* a, Info* b);
207   static Info* And(Info* a, Info* b);
208   static Info* Star(Info* a);
209   static Info* Plus(Info* a);
210   static Info* Quest(Info* a);
211   static Info* EmptyString();
212   static Info* NoMatch();
213   static Info* AnyChar();
214   static Info* CClass(CharClass* cc, bool latin1);
215   static Info* Literal(Rune r);
216   static Info* LiteralLatin1(Rune r);
217   static Info* AnyMatch();
218 
219   // Format Info as a string.
220   string ToString();
221 
222   // Caller takes ownership of the Prefilter.
223   Prefilter* TakeMatch();
224 
exact()225   set<string>& exact() { return exact_; }
226 
is_exact() const227   bool is_exact() const { return is_exact_; }
228 
229   class Walker;
230 
231  private:
232   set<string> exact_;
233 
234   // When is_exact_ is true, the strings that match
235   // are placed in exact_. When it is no longer an exact
236   // set of strings that match this RE, then is_exact_
237   // is false and the match_ contains the required match
238   // criteria.
239   bool is_exact_;
240 
241   // Accumulated Prefilter query that any
242   // match for this regexp is guaranteed to match.
243   Prefilter* match_;
244 };
245 
246 
Info()247 Prefilter::Info::Info()
248   : is_exact_(false),
249     match_(NULL) {
250 }
251 
~Info()252 Prefilter::Info::~Info() {
253   delete match_;
254 }
255 
TakeMatch()256 Prefilter* Prefilter::Info::TakeMatch() {
257   if (is_exact_) {
258     match_ = Prefilter::OrStrings(&exact_);
259     is_exact_ = false;
260   }
261   Prefilter* m = match_;
262   match_ = NULL;
263   return m;
264 }
265 
266 // Format a Info in string form.
ToString()267 string Prefilter::Info::ToString() {
268   if (this == NULL) {
269     // Sometimes when iterating on children of a node,
270     // some children might have NULL Info. Adding
271     // the check here for NULL to take care of cases where
272     // the caller is not checking.
273     return "";
274   }
275 
276   if (is_exact_) {
277     int n = 0;
278     string s;
279     for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
280       if (n++ > 0)
281         s += ",";
282       s += *i;
283     }
284     return s;
285   }
286 
287   if (match_)
288     return match_->DebugString();
289 
290   return "";
291 }
292 
293 // Add the strings from src to dst.
CopyIn(const set<string> & src,set<string> * dst)294 static void CopyIn(const set<string>& src, set<string>* dst) {
295   for (ConstSSIter i = src.begin(); i != src.end(); ++i)
296     dst->insert(*i);
297 }
298 
299 // Add the cross-product of a and b to dst.
300 // (For each string i in a and j in b, add i+j.)
CrossProduct(const set<string> & a,const set<string> & b,set<string> * dst)301 static void CrossProduct(const set<string>& a,
302                          const set<string>& b,
303                          set<string>* dst) {
304   for (ConstSSIter i = a.begin(); i != a.end(); ++i)
305     for (ConstSSIter j = b.begin(); j != b.end(); ++j)
306       dst->insert(*i + *j);
307 }
308 
309 // Concats a and b. Requires that both are exact sets.
310 // Forms an exact set that is a crossproduct of a and b.
Concat(Info * a,Info * b)311 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
312   if (a == NULL)
313     return b;
314   DCHECK(a->is_exact_);
315   DCHECK(b && b->is_exact_);
316   Info *ab = new Info();
317 
318   CrossProduct(a->exact_, b->exact_, &ab->exact_);
319   ab->is_exact_ = true;
320 
321   delete a;
322   delete b;
323   return ab;
324 }
325 
326 // Constructs an inexact Info for ab given a and b.
327 // Used only when a or b is not exact or when the
328 // exact cross product is likely to be too big.
And(Info * a,Info * b)329 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
330   if (a == NULL)
331     return b;
332   if (b == NULL)
333     return a;
334 
335   Info *ab = new Info();
336 
337   ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
338   ab->is_exact_ = false;
339   delete a;
340   delete b;
341   return ab;
342 }
343 
344 // Constructs Info for a|b given a and b.
Alt(Info * a,Info * b)345 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
346   Info *ab = new Info();
347 
348   if (a->is_exact_ && b->is_exact_) {
349     CopyIn(a->exact_, &ab->exact_);
350     CopyIn(b->exact_, &ab->exact_);
351     ab->is_exact_ = true;
352   } else {
353     // Either a or b has is_exact_ = false. If the other
354     // one has is_exact_ = true, we move it to match_ and
355     // then create a OR of a,b. The resulting Info has
356     // is_exact_ = false.
357     ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
358     ab->is_exact_ = false;
359   }
360 
361   delete a;
362   delete b;
363   return ab;
364 }
365 
366 // Constructs Info for a? given a.
Quest(Info * a)367 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
368   Info *ab = new Info();
369 
370   ab->is_exact_ = false;
371   ab->match_ = new Prefilter(ALL);
372   delete a;
373   return ab;
374 }
375 
376 // Constructs Info for a* given a.
377 // Same as a? -- not much to do.
Star(Info * a)378 Prefilter::Info* Prefilter::Info::Star(Info *a) {
379   return Quest(a);
380 }
381 
382 // Constructs Info for a+ given a. If a was exact set, it isn't
383 // anymore.
Plus(Info * a)384 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
385   Info *ab = new Info();
386 
387   ab->match_ = a->TakeMatch();
388   ab->is_exact_ = false;
389 
390   delete a;
391   return ab;
392 }
393 
RuneToString(Rune r)394 static string RuneToString(Rune r) {
395   char buf[UTFmax];
396   int n = runetochar(buf, &r);
397   return string(buf, n);
398 }
399 
RuneToStringLatin1(Rune r)400 static string RuneToStringLatin1(Rune r) {
401   char c = r & 0xff;
402   return string(&c, 1);
403 }
404 
405 // Constructs Info for literal rune.
Literal(Rune r)406 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
407   Info* info = new Info();
408   info->exact_.insert(RuneToString(ToLowerRune(r)));
409   info->is_exact_ = true;
410   return info;
411 }
412 
413 // Constructs Info for literal rune for Latin1 encoded string.
LiteralLatin1(Rune r)414 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
415   Info* info = new Info();
416   info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
417   info->is_exact_ = true;
418   return info;
419 }
420 
421 // Constructs Info for dot (any character).
AnyChar()422 Prefilter::Info* Prefilter::Info::AnyChar() {
423   Prefilter::Info* info = new Prefilter::Info();
424   info->match_ = new Prefilter(ALL);
425   return info;
426 }
427 
428 // Constructs Prefilter::Info for no possible match.
NoMatch()429 Prefilter::Info* Prefilter::Info::NoMatch() {
430   Prefilter::Info* info = new Prefilter::Info();
431   info->match_ = new Prefilter(NONE);
432   return info;
433 }
434 
435 // Constructs Prefilter::Info for any possible match.
436 // This Prefilter::Info is valid for any regular expression,
437 // since it makes no assertions whatsoever about the
438 // strings being matched.
AnyMatch()439 Prefilter::Info* Prefilter::Info::AnyMatch() {
440   Prefilter::Info *info = new Prefilter::Info();
441   info->match_ = new Prefilter(ALL);
442   return info;
443 }
444 
445 // Constructs Prefilter::Info for just the empty string.
EmptyString()446 Prefilter::Info* Prefilter::Info::EmptyString() {
447   Prefilter::Info* info = new Prefilter::Info();
448   info->is_exact_ = true;
449   info->exact_.insert("");
450   return info;
451 }
452 
453 // Constructs Prefilter::Info for a character class.
454 typedef CharClass::iterator CCIter;
CClass(CharClass * cc,bool latin1)455 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
456                                          bool latin1) {
457   if (Trace) {
458     VLOG(0) << "CharClassInfo:";
459     for (CCIter i = cc->begin(); i != cc->end(); ++i)
460       VLOG(0) << "  " << i->lo << "-" << i->hi;
461   }
462 
463   // If the class is too large, it's okay to overestimate.
464   if (cc->size() > 10)
465     return AnyChar();
466 
467   Prefilter::Info *a = new Prefilter::Info();
468   for (CCIter i = cc->begin(); i != cc->end(); ++i)
469     for (Rune r = i->lo; r <= i->hi; r++) {
470       if (latin1) {
471         a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
472       } else {
473         a->exact_.insert(RuneToString(ToLowerRune(r)));
474       }
475     }
476 
477 
478   a->is_exact_ = true;
479 
480   if (Trace) {
481     VLOG(0) << " = " << a->ToString();
482   }
483 
484   return a;
485 }
486 
487 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
488  public:
Walker(bool latin1)489   Walker(bool latin1) : latin1_(latin1) {}
490 
491   virtual Info* PostVisit(
492       Regexp* re, Info* parent_arg,
493       Info* pre_arg,
494       Info** child_args, int nchild_args);
495 
496   virtual Info* ShortVisit(
497       Regexp* re,
498       Info* parent_arg);
499 
latin1()500   bool latin1() { return latin1_; }
501  private:
502   bool latin1_;
503   DISALLOW_EVIL_CONSTRUCTORS(Walker);
504 };
505 
BuildInfo(Regexp * re)506 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
507   if (Trace) {
508     LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
509   }
510 
511   bool latin1 = re->parse_flags() & Regexp::Latin1;
512   Prefilter::Info::Walker w(latin1);
513   Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
514 
515   if (w.stopped_early()) {
516     delete info;
517     return NULL;
518   }
519 
520   return info;
521 }
522 
ShortVisit(Regexp * re,Prefilter::Info * parent_arg)523 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
524     Regexp* re, Prefilter::Info* parent_arg) {
525   return AnyMatch();
526 }
527 
528 // Constructs the Prefilter::Info for the given regular expression.
529 // Assumes re is simplified.
PostVisit(Regexp * re,Prefilter::Info * parent_arg,Prefilter::Info * pre_arg,Prefilter::Info ** child_args,int nchild_args)530 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
531     Regexp* re, Prefilter::Info* parent_arg,
532     Prefilter::Info* pre_arg, Prefilter::Info** child_args,
533     int nchild_args) {
534   Prefilter::Info *info;
535   switch (re->op()) {
536     default:
537     case kRegexpRepeat:
538       LOG(DFATAL) << "Bad regexp op " << re->op();
539       info = EmptyString();
540       break;
541 
542     case kRegexpNoMatch:
543       info = NoMatch();
544       break;
545 
546     // These ops match the empty string:
547     case kRegexpEmptyMatch:      // anywhere
548     case kRegexpBeginLine:       // at beginning of line
549     case kRegexpEndLine:         // at end of line
550     case kRegexpBeginText:       // at beginning of text
551     case kRegexpEndText:         // at end of text
552     case kRegexpWordBoundary:    // at word boundary
553     case kRegexpNoWordBoundary:  // not at word boundary
554       info = EmptyString();
555       break;
556 
557     case kRegexpLiteral:
558       if (latin1()) {
559         info = LiteralLatin1(re->rune());
560       }
561       else {
562         info = Literal(re->rune());
563       }
564       break;
565 
566     case kRegexpLiteralString:
567       if (re->nrunes() == 0) {
568         info = NoMatch();
569         break;
570       }
571       if (latin1()) {
572         info = LiteralLatin1(re->runes()[0]);
573         for (int i = 1; i < re->nrunes(); i++) {
574           info = Concat(info, LiteralLatin1(re->runes()[i]));
575         }
576       } else {
577         info = Literal(re->runes()[0]);
578         for (int i = 1; i < re->nrunes(); i++) {
579           info = Concat(info, Literal(re->runes()[i]));
580         }
581       }
582       break;
583 
584     case kRegexpConcat: {
585       // Accumulate in info.
586       // Exact is concat of recent contiguous exact nodes.
587       info = NULL;
588       Info* exact = NULL;
589       for (int i = 0; i < nchild_args; i++) {
590         Info* ci = child_args[i];  // child info
591         if (!ci->is_exact() ||
592             (exact && ci->exact().size() * exact->exact().size() > 16)) {
593           // Exact run is over.
594           info = And(info, exact);
595           exact = NULL;
596           // Add this child's info.
597           info = And(info, ci);
598         } else {
599           // Append to exact run.
600           exact = Concat(exact, ci);
601         }
602       }
603       info = And(info, exact);
604     }
605       break;
606 
607     case kRegexpAlternate:
608       info = child_args[0];
609       for (int i = 1; i < nchild_args; i++)
610         info = Alt(info, child_args[i]);
611       VLOG(10) << "Alt: " << info->ToString();
612       break;
613 
614     case kRegexpStar:
615       info = Star(child_args[0]);
616       break;
617 
618     case kRegexpQuest:
619       info = Quest(child_args[0]);
620       break;
621 
622     case kRegexpPlus:
623       info = Plus(child_args[0]);
624       break;
625 
626     case kRegexpAnyChar:
627       // Claim nothing, except that it's not empty.
628       info = AnyChar();
629       break;
630 
631     case kRegexpCharClass:
632       info = CClass(re->cc(), latin1());
633       break;
634 
635     case kRegexpCapture:
636       // These don't affect the set of matching strings.
637       info = child_args[0];
638       break;
639   }
640 
641   if (Trace) {
642     VLOG(0) << "BuildInfo " << re->ToString()
643             << ": " << info->ToString();
644   }
645 
646   return info;
647 }
648 
649 
FromRegexp(Regexp * re)650 Prefilter* Prefilter::FromRegexp(Regexp* re) {
651   if (re == NULL)
652     return NULL;
653 
654   Regexp* simple = re->Simplify();
655   Prefilter::Info *info = BuildInfo(simple);
656 
657   simple->Decref();
658   if (info == NULL)
659     return NULL;
660 
661   Prefilter* m = info->TakeMatch();
662 
663   delete info;
664   return m;
665 }
666 
DebugString() const667 string Prefilter::DebugString() const {
668   if (this == NULL)
669     return "<nil>";
670 
671   switch (op_) {
672     default:
673       LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
674       return StringPrintf("op%d", op_);
675     case NONE:
676       return "*no-matches*";
677     case ATOM:
678       return atom_;
679     case ALL:
680       return "";
681     case AND: {
682       string s = "";
683       for (int i = 0; i < subs_->size(); i++) {
684         if (i > 0)
685           s += " ";
686         s += (*subs_)[i]->DebugString();
687       }
688       return s;
689     }
690     case OR: {
691       string s = "(";
692       for (int i = 0; i < subs_->size(); i++) {
693         if (i > 0)
694           s += "|";
695         s += (*subs_)[i]->DebugString();
696       }
697       s += ")";
698       return s;
699     }
700   }
701 }
702 
FromRE2(const RE2 * re2)703 Prefilter* Prefilter::FromRE2(const RE2* re2) {
704   if (re2 == NULL)
705     return NULL;
706 
707   Regexp* regexp = re2->Regexp();
708   if (regexp == NULL)
709     return NULL;
710 
711   return FromRegexp(regexp);
712 }
713 
714 
715 }  // namespace re2
716