1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include "util/util.h"
6 #include "util/flags.h"
7 #include "re2/prefilter.h"
8 #include "re2/prefilter_tree.h"
9 #include "re2/re2.h"
10 
11 DEFINE_int32(filtered_re2_min_atom_len,
12              3,
13              "Strings less than this length are not stored as atoms");
14 
15 namespace re2 {
16 
PrefilterTree()17 PrefilterTree::PrefilterTree()
18     : compiled_(false) {
19 }
20 
~PrefilterTree()21 PrefilterTree::~PrefilterTree() {
22   for (int i = 0; i < prefilter_vec_.size(); i++)
23     delete prefilter_vec_[i];
24 
25   for (int i = 0; i < entries_.size(); i++)
26     delete entries_[i].parents;
27 }
28 
29 // Functions used for adding and Compiling prefilters to the
30 // PrefilterTree.
KeepPart(Prefilter * prefilter,int level)31 static bool KeepPart(Prefilter* prefilter, int level) {
32   if (prefilter == NULL)
33     return false;
34 
35   switch (prefilter->op()) {
36     default:
37       LOG(DFATAL) << "Unexpected op in KeepPart: "
38                   << prefilter->op();
39       return false;
40 
41     case Prefilter::ALL:
42       return false;
43 
44     case Prefilter::ATOM:
45       return prefilter->atom().size() >=
46           FLAGS_filtered_re2_min_atom_len;
47 
48     case Prefilter::AND: {
49       int j = 0;
50       vector<Prefilter*>* subs = prefilter->subs();
51       for (int i = 0; i < subs->size(); i++)
52         if (KeepPart((*subs)[i], level + 1))
53           (*subs)[j++] = (*subs)[i];
54         else
55           delete (*subs)[i];
56 
57       subs->resize(j);
58       return j > 0;
59     }
60 
61     case Prefilter::OR:
62       for (int i = 0; i < prefilter->subs()->size(); i++)
63         if (!KeepPart((*prefilter->subs())[i], level + 1))
64           return false;
65       return true;
66   }
67 }
68 
Add(Prefilter * f)69 void PrefilterTree::Add(Prefilter *f) {
70   if (compiled_) {
71     LOG(DFATAL) << "Add after Compile.";
72     return;
73   }
74   if (f != NULL && !KeepPart(f, 0)) {
75     delete f;
76     f = NULL;
77   }
78 
79   prefilter_vec_.push_back(f);
80 }
81 
Compile(vector<string> * atom_vec)82 void PrefilterTree::Compile(vector<string>* atom_vec) {
83   if (compiled_) {
84     LOG(DFATAL) << "Compile after Compile.";
85     return;
86   }
87 
88   // We do this check to support some legacy uses of
89   // PrefilterTree that call Compile before adding any regexps,
90   // and expect Compile not to have effect.
91   if (prefilter_vec_.empty())
92     return;
93 
94   compiled_ = true;
95 
96   AssignUniqueIds(atom_vec);
97 
98   // Identify nodes that are too common among prefilters and are
99   // triggering too many parents. Then get rid of them if possible.
100   // Note that getting rid of a prefilter node simply means they are
101   // no longer necessary for their parent to trigger; that is, we do
102   // not miss out on any regexps triggering by getting rid of a
103   // prefilter node.
104   for (int i = 0; i < entries_.size(); i++) {
105     IntMap* parents = entries_[i].parents;
106     if (parents->size() > 8) {
107       // This one triggers too many things. If all the parents are AND
108       // nodes and have other things guarding them, then get rid of
109       // this trigger. TODO(vsri): Adjust the threshold appropriately,
110       // make it a function of total number of nodes?
111       bool have_other_guard = true;
112       for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
113         have_other_guard = have_other_guard &&
114             (entries_[it->index()].propagate_up_at_count > 1);
115 
116       if (have_other_guard) {
117         for (IntMap::iterator it = parents->begin();
118              it != parents->end(); ++it)
119           entries_[it->index()].propagate_up_at_count -= 1;
120 
121         parents->clear();  // Forget the parents
122       }
123     }
124   }
125 
126   PrintDebugInfo();
127 }
128 
CanonicalNode(Prefilter * node)129 Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
130   string node_string = NodeString(node);
131   map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
132   if (iter == node_map_.end())
133     return NULL;
134   return (*iter).second;
135 }
136 
Itoa(int n)137 static string Itoa(int n) {
138   char buf[100];
139   snprintf(buf, sizeof buf, "%d", n);
140   return string(buf);
141 }
142 
NodeString(Prefilter * node) const143 string PrefilterTree::NodeString(Prefilter* node) const {
144   // Adding the operation disambiguates AND/OR/atom nodes.
145   string s = Itoa(node->op()) + ":";
146   if (node->op() == Prefilter::ATOM) {
147     s += node->atom();
148   } else {
149     for (int i = 0; i < node->subs()->size() ; i++) {
150       if (i > 0)
151         s += ',';
152       s += Itoa((*node->subs())[i]->unique_id());
153     }
154   }
155   return s;
156 }
157 
AssignUniqueIds(vector<string> * atom_vec)158 void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
159   atom_vec->clear();
160 
161   // Build vector of all filter nodes, sorted topologically
162   // from top to bottom in v.
163   vector<Prefilter*> v;
164 
165   // Add the top level nodes of each regexp prefilter.
166   for (int i = 0; i < prefilter_vec_.size(); i++) {
167     Prefilter* f = prefilter_vec_[i];
168     if (f == NULL)
169       unfiltered_.push_back(i);
170 
171     // We push NULL also on to v, so that we maintain the
172     // mapping of index==regexpid for level=0 prefilter nodes.
173     v.push_back(f);
174   }
175 
176   // Now add all the descendant nodes.
177   for (int i = 0; i < v.size(); i++) {
178     Prefilter* f = v[i];
179     if (f == NULL)
180       continue;
181     if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
182       const vector<Prefilter*>& subs = *f->subs();
183       for (int j = 0; j < subs.size(); j++)
184         v.push_back(subs[j]);
185     }
186   }
187 
188   // Identify unique nodes.
189   int unique_id = 0;
190   for (int i = v.size() - 1; i >= 0; i--) {
191     Prefilter *node = v[i];
192     if (node == NULL)
193       continue;
194     node->set_unique_id(-1);
195     Prefilter* canonical = CanonicalNode(node);
196     if (canonical == NULL) {
197       // Any further nodes that have the same node string
198       // will find this node as the canonical node.
199       node_map_[NodeString(node)] = node;
200       if (node->op() == Prefilter::ATOM) {
201         atom_vec->push_back(node->atom());
202         atom_index_to_id_.push_back(unique_id);
203       }
204       node->set_unique_id(unique_id++);
205     } else {
206       node->set_unique_id(canonical->unique_id());
207     }
208   }
209   entries_.resize(node_map_.size());
210 
211   // Create parent IntMap for the entries.
212   for (int i = v.size()  - 1; i >= 0; i--) {
213     Prefilter* prefilter = v[i];
214     if (prefilter == NULL)
215       continue;
216 
217     if (CanonicalNode(prefilter) != prefilter)
218       continue;
219 
220     Entry* entry = &entries_[prefilter->unique_id()];
221     entry->parents = new IntMap(node_map_.size());
222   }
223 
224   // Fill the entries.
225   for (int i = v.size()  - 1; i >= 0; i--) {
226     Prefilter* prefilter = v[i];
227     if (prefilter == NULL)
228       continue;
229 
230     if (CanonicalNode(prefilter) != prefilter)
231       continue;
232 
233     Entry* entry = &entries_[prefilter->unique_id()];
234 
235     switch (prefilter->op()) {
236       default:
237       case Prefilter::ALL:
238         LOG(DFATAL) << "Unexpected op: " << prefilter->op();
239         return;
240 
241       case Prefilter::ATOM:
242         entry->propagate_up_at_count = 1;
243         break;
244 
245       case Prefilter::OR:
246       case Prefilter::AND: {
247         IntMap uniq_child(node_map_.size());
248         for (int j = 0; j < prefilter->subs()->size() ; j++) {
249           Prefilter* child = (*prefilter->subs())[j];
250           Prefilter* canonical = CanonicalNode(child);
251           if (canonical == NULL) {
252             LOG(DFATAL) << "Null canonical node";
253             return;
254           }
255           int child_id = canonical->unique_id();
256           if (!uniq_child.has_index(child_id))
257             uniq_child.set_new(child_id, 1);
258           // To the child, we want to add to parent indices.
259           Entry* child_entry = &entries_[child_id];
260           if (!child_entry->parents->has_index(prefilter->unique_id()))
261             child_entry->parents->set_new(prefilter->unique_id(), 1);
262         }
263         entry->propagate_up_at_count =
264             prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
265 
266         break;
267       }
268     }
269   }
270 
271   // For top level nodes, populate regexp id.
272   for (int i = 0; i < prefilter_vec_.size(); i++) {
273     if (prefilter_vec_[i] == NULL)
274       continue;
275     int id = CanonicalNode(prefilter_vec_[i])->unique_id();
276     DCHECK_LE(0, id);
277     Entry* entry = &entries_[id];
278     entry->regexps.push_back(i);
279   }
280 }
281 
282 // Functions for triggering during search.
RegexpsGivenStrings(const vector<int> & matched_atoms,vector<int> * regexps) const283 void PrefilterTree::RegexpsGivenStrings(
284     const vector<int>& matched_atoms,
285     vector<int>* regexps) const {
286   regexps->clear();
287   if (!compiled_) {
288     LOG(WARNING) << "Compile() not called";
289     for (int i = 0; i < prefilter_vec_.size(); ++i)
290       regexps->push_back(i);
291   } else {
292     if (!prefilter_vec_.empty()) {
293       IntMap regexps_map(prefilter_vec_.size());
294       vector<int> matched_atom_ids;
295       for (int j = 0; j < matched_atoms.size(); j++) {
296         matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
297         VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
298       }
299       PropagateMatch(matched_atom_ids, &regexps_map);
300       for (IntMap::iterator it = regexps_map.begin();
301            it != regexps_map.end();
302            ++it)
303         regexps->push_back(it->index());
304 
305       regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
306     }
307   }
308   sort(regexps->begin(), regexps->end());
309 }
310 
PropagateMatch(const vector<int> & atom_ids,IntMap * regexps) const311 void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
312                                    IntMap* regexps) const {
313   IntMap count(entries_.size());
314   IntMap work(entries_.size());
315   for (int i = 0; i < atom_ids.size(); i++)
316     work.set(atom_ids[i], 1);
317   for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
318     const Entry& entry = entries_[it->index()];
319     VLOG(10) << "Processing: " << it->index();
320     // Record regexps triggered.
321     for (int i = 0; i < entry.regexps.size(); i++) {
322       VLOG(10) << "Regexp triggered: " << entry.regexps[i];
323       regexps->set(entry.regexps[i], 1);
324     }
325     int c;
326     // Pass trigger up to parents.
327     for (IntMap::iterator it = entry.parents->begin();
328          it != entry.parents->end();
329          ++it) {
330       int j = it->index();
331       const Entry& parent = entries_[j];
332       VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
333       // Delay until all the children have succeeded.
334       if (parent.propagate_up_at_count > 1) {
335         if (count.has_index(j)) {
336           c = count.get_existing(j) + 1;
337           count.set_existing(j, c);
338         } else {
339           c = 1;
340           count.set_new(j, c);
341         }
342         if (c < parent.propagate_up_at_count)
343           continue;
344       }
345       VLOG(10) << "Triggering: " << j;
346       // Trigger the parent.
347       work.set(j, 1);
348     }
349   }
350 }
351 
352 // Debugging help.
PrintPrefilter(int regexpid)353 void PrefilterTree::PrintPrefilter(int regexpid) {
354   LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
355 }
356 
PrintDebugInfo()357 void PrefilterTree::PrintDebugInfo() {
358   VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
359   VLOG(10) << "#Unique Nodes: " << entries_.size();
360 
361   for (int i = 0; i < entries_.size(); ++i) {
362     IntMap* parents = entries_[i].parents;
363     const vector<int>& regexps = entries_[i].regexps;
364     VLOG(10) << "EntryId: " << i
365             << " N: " << parents->size() << " R: " << regexps.size();
366     for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
367       VLOG(10) << it->index();
368   }
369   VLOG(10) << "Map:";
370   for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
371        iter != node_map_.end(); ++iter)
372     VLOG(10) << "NodeId: " << (*iter).second->unique_id()
373             << " Str: " << (*iter).first;
374 }
375 
DebugNodeString(Prefilter * node) const376 string PrefilterTree::DebugNodeString(Prefilter* node) const {
377   string node_string = "";
378 
379   if (node->op() == Prefilter::ATOM) {
380     DCHECK(!node->atom().empty());
381     node_string += node->atom();
382   } else {
383     // Adding the operation disambiguates AND and OR nodes.
384     node_string +=  node->op() == Prefilter::AND ? "AND" : "OR";
385     node_string += "(";
386     for (int i = 0; i < node->subs()->size() ; i++) {
387       if (i > 0)
388         node_string += ',';
389       node_string += Itoa((*node->subs())[i]->unique_id());
390       node_string += ":";
391       node_string += DebugNodeString((*node->subs())[i]);
392     }
393     node_string += ")";
394   }
395   return node_string;
396 }
397 
398 }  // namespace re2
399