1 //===-- TrigramIndex.cpp - a heuristic for SpecialCaseList ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // TrigramIndex implements a heuristic for SpecialCaseList that allows to
11 // filter out ~99% incoming queries when all regular expressions in the
12 // SpecialCaseList are simple wildcards with '*' and '.'. If rules are more
13 // complicated, the check is defeated and it will always pass the queries to a
14 // full regex.
15 //
16 //===----------------------------------------------------------------------===//
17 
18 #include "llvm/Support/TrigramIndex.h"
19 #include "llvm/ADT/SmallVector.h"
20 
21 #include <set>
22 #include <string>
23 #include <unordered_map>
24 
25 using namespace llvm;
26 
27 static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}";
28 
isAdvancedMetachar(unsigned Char)29 static bool isAdvancedMetachar(unsigned Char) {
30   return strchr(RegexAdvancedMetachars, Char) != nullptr;
31 }
32 
insert(std::string Regex)33 void TrigramIndex::insert(std::string Regex) {
34   if (Defeated) return;
35   std::set<unsigned> Was;
36   unsigned Cnt = 0;
37   unsigned Tri = 0;
38   unsigned Len = 0;
39   bool Escaped = false;
40   for (unsigned Char : Regex) {
41     if (!Escaped) {
42       // Regular expressions allow escaping symbols by preceding it with '\'.
43       if (Char == '\\') {
44         Escaped = true;
45         continue;
46       }
47       if (isAdvancedMetachar(Char)) {
48         // This is a more complicated regex than we can handle here.
49         Defeated = true;
50         return;
51       }
52       if (Char == '.' || Char == '*') {
53         Tri = 0;
54         Len = 0;
55         continue;
56       }
57     }
58     if (Escaped && Char >= '1' && Char <= '9') {
59       Defeated = true;
60       return;
61     }
62     // We have already handled escaping and can reset the flag.
63     Escaped = false;
64     Tri = ((Tri << 8) + Char) & 0xFFFFFF;
65     Len++;
66     if (Len < 3)
67       continue;
68     // We don't want the index to grow too much for the popular trigrams,
69     // as they are weak signals. It's ok to still require them for the
70     // rules we have already processed. It's just a small additional
71     // computational cost.
72     if (Index[Tri].size() >= 4)
73       continue;
74     Cnt++;
75     if (!Was.count(Tri)) {
76       // Adding the current rule to the index.
77       Index[Tri].push_back(Counts.size());
78       Was.insert(Tri);
79     }
80   }
81   if (!Cnt) {
82     // This rule does not have remarkable trigrams to rely on.
83     // We have to always call the full regex chain.
84     Defeated = true;
85     return;
86   }
87   Counts.push_back(Cnt);
88 }
89 
isDefinitelyOut(StringRef Query) const90 bool TrigramIndex::isDefinitelyOut(StringRef Query) const {
91   if (Defeated)
92     return false;
93   std::vector<unsigned> CurCounts(Counts.size());
94   unsigned Tri = 0;
95   for (size_t I = 0; I < Query.size(); I++) {
96     Tri = ((Tri << 8) + Query[I]) & 0xFFFFFF;
97     if (I < 2)
98       continue;
99     const auto &II = Index.find(Tri);
100     if (II == Index.end())
101       continue;
102     for (size_t J : II->second) {
103       CurCounts[J]++;
104       // If we have reached a desired limit, we have to look at the query
105       // more closely by running a full regex.
106       if (CurCounts[J] >= Counts[J])
107         return false;
108     }
109   }
110   return true;
111 }
112