// Copyright 2009 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // The PrefilterTree class is used to form an AND-OR tree of strings // that would trigger each regexp. The 'prefilter' of each regexp is // added tp PrefilterTree, and then PrefilterTree is used to find all // the unique strings across the prefilters. During search, by using // matches from a string matching engine, PrefilterTree deduces the // set of regexps that are to be triggered. The 'string matching // engine' itself is outside of this class, and the caller can use any // favorite engine. PrefilterTree provides a set of strings (called // atoms) that the user of this class should use to do the string // matching. // #ifndef RE2_PREFILTER_TREE_H_ #define RE2_PREFILTER_TREE_H_ #include "util/util.h" #include "util/sparse_array.h" namespace re2 { typedef SparseArray IntMap; class Prefilter; class PrefilterTree { public: PrefilterTree(); ~PrefilterTree(); // Adds the prefilter for the next regexp. Note that we assume that // Add called sequentially for all regexps. All Add calls // must precede Compile. void Add(Prefilter* prefilter); // The Compile returns a vector of string in atom_vec. // Call this after all the prefilters are added through Add. // No calls to Add after Compile are allowed. // The caller should use the returned set of strings to do string matching. // Each time a string matches, the corresponding index then has to be // and passed to RegexpsGivenStrings below. void Compile(vector* atom_vec); // Given the indices of the atoms that matched, returns the indexes // of regexps that should be searched. The matched_atoms should // contain all the ids of string atoms that were found to match the // content. The caller can use any string match engine to perform // this function. This function is thread safe. void RegexpsGivenStrings(const vector& matched_atoms, vector* regexps) const; // Print debug prefilter. Also prints unique ids associated with // nodes of the prefilter of the regexp. void PrintPrefilter(int regexpid); // Each unique node has a corresponding Entry that helps in // passing the matching trigger information along the tree. struct Entry { public: // How many children should match before this node triggers the // parent. For an atom and an OR node, this is 1 and for an AND // node, it is the number of unique children. int propagate_up_at_count; // When this node is ready to trigger the parent, what are the indices // of the parent nodes to trigger. The reason there may be more than // one is because of sharing. For example (abc | def) and (xyz | def) // are two different nodes, but they share the atom 'def'. So when // 'def' matches, it triggers two parents, corresponding to the two // different OR nodes. IntMap* parents; // When this node is ready to trigger the parent, what are the // regexps that are triggered. vector regexps; }; private: // This function assigns unique ids to various parts of the // prefilter, by looking at if these nodes are already in the // PrefilterTree. void AssignUniqueIds(vector* atom_vec); // Given the matching atoms, find the regexps to be triggered. void PropagateMatch(const vector& atom_ids, IntMap* regexps) const; // Returns the prefilter node that has the same NodeString as this // node. For the canonical node, returns node. Prefilter* CanonicalNode(Prefilter* node); // A string that uniquely identifies the node. Assumes that the // children of node has already been assigned unique ids. string NodeString(Prefilter* node) const; // Recursively constructs a readable prefilter string. string DebugNodeString(Prefilter* node) const; // Used for debugging. void PrintDebugInfo(); // These are all the nodes formed by Compile. Essentially, there is // one node for each unique atom and each unique AND/OR node. vector entries_; // Map node string to canonical Prefilter node. map node_map_; // indices of regexps that always pass through the filter (since we // found no required literals in these regexps). vector unfiltered_; // vector of Prefilter for all regexps. vector prefilter_vec_; // Atom index in returned strings to entry id mapping. vector atom_index_to_id_; // Has the prefilter tree been compiled. bool compiled_; DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree); }; } // namespace #endif // RE2_PREFILTER_TREE_H_