1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // The PrefilterTree class is used to form an AND-OR tree of strings
6 // that would trigger each regexp. The 'prefilter' of each regexp is
7 // added tp PrefilterTree, and then PrefilterTree is used to find all
8 // the unique strings across the prefilters. During search, by using
9 // matches from a string matching engine, PrefilterTree deduces the
10 // set of regexps that are to be triggered. The 'string matching
11 // engine' itself is outside of this class, and the caller can use any
12 // favorite engine. PrefilterTree provides a set of strings (called
13 // atoms) that the user of this class should use to do the string
14 // matching.
15 //
16 #ifndef RE2_PREFILTER_TREE_H_
17 #define RE2_PREFILTER_TREE_H_
18 
19 #include "util/util.h"
20 #include "util/sparse_array.h"
21 
22 namespace re2 {
23 
24 typedef SparseArray<int> IntMap;
25 
26 class Prefilter;
27 
28 class PrefilterTree {
29  public:
30   PrefilterTree();
31   ~PrefilterTree();
32 
33   // Adds the prefilter for the next regexp. Note that we assume that
34   // Add called sequentially for all regexps. All Add calls
35   // must precede Compile.
36   void Add(Prefilter* prefilter);
37 
38   // The Compile returns a vector of string in atom_vec.
39   // Call this after all the prefilters are added through Add.
40   // No calls to Add after Compile are allowed.
41   // The caller should use the returned set of strings to do string matching.
42   // Each time a string matches, the corresponding index then has to be
43   // and passed to RegexpsGivenStrings below.
44   void Compile(vector<string>* atom_vec);
45 
46   // Given the indices of the atoms that matched, returns the indexes
47   // of regexps that should be searched.  The matched_atoms should
48   // contain all the ids of string atoms that were found to match the
49   // content. The caller can use any string match engine to perform
50   // this function. This function is thread safe.
51   void RegexpsGivenStrings(const vector<int>& matched_atoms,
52                            vector<int>* regexps) const;
53 
54   // Print debug prefilter. Also prints unique ids associated with
55   // nodes of the prefilter of the regexp.
56   void PrintPrefilter(int regexpid);
57 
58 
59   // Each unique node has a corresponding Entry that helps in
60   // passing the matching trigger information along the tree.
61   struct Entry {
62    public:
63     // How many children should match before this node triggers the
64     // parent. For an atom and an OR node, this is 1 and for an AND
65     // node, it is the number of unique children.
66     int propagate_up_at_count;
67 
68     // When this node is ready to trigger the parent, what are the indices
69     // of the parent nodes to trigger. The reason there may be more than
70     // one is because of sharing. For example (abc | def) and (xyz | def)
71     // are two different nodes, but they share the atom 'def'. So when
72     // 'def' matches, it triggers two parents, corresponding to the two
73     // different OR nodes.
74     IntMap* parents;
75 
76     // When this node is ready to trigger the parent, what are the
77     // regexps that are triggered.
78     vector<int> regexps;
79   };
80 
81  private:
82   // This function assigns unique ids to various parts of the
83   // prefilter, by looking at if these nodes are already in the
84   // PrefilterTree.
85   void AssignUniqueIds(vector<string>* atom_vec);
86 
87   // Given the matching atoms, find the regexps to be triggered.
88   void PropagateMatch(const vector<int>& atom_ids,
89                       IntMap* regexps) const;
90 
91   // Returns the prefilter node that has the same NodeString as this
92   // node. For the canonical node, returns node.
93   Prefilter* CanonicalNode(Prefilter* node);
94 
95   // A string that uniquely identifies the node. Assumes that the
96   // children of node has already been assigned unique ids.
97   string NodeString(Prefilter* node) const;
98 
99   // Recursively constructs a readable prefilter string.
100   string DebugNodeString(Prefilter* node) const;
101 
102   // Used for debugging.
103   void PrintDebugInfo();
104 
105   // These are all the nodes formed by Compile. Essentially, there is
106   // one node for each unique atom and each unique AND/OR node.
107   vector<Entry> entries_;
108 
109   // Map node string to canonical Prefilter node.
110   map<string, Prefilter*> node_map_;
111 
112   // indices of regexps that always pass through the filter (since we
113   // found no required literals in these regexps).
114   vector<int> unfiltered_;
115 
116   // vector of Prefilter for all regexps.
117   vector<Prefilter*> prefilter_vec_;
118 
119   // Atom index in returned strings to entry id mapping.
120   vector<int> atom_index_to_id_;
121 
122   // Has the prefilter tree been compiled.
123   bool compiled_;
124 
125   DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
126 };
127 
128 }  // namespace
129 
130 #endif  // RE2_PREFILTER_TREE_H_
131