1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Compiled regular expression representation.
6 // Tested by compile_test.cc
7 
8 #include "util/util.h"
9 #include "util/sparse_set.h"
10 #include "re2/prog.h"
11 #include "re2/stringpiece.h"
12 
13 namespace re2 {
14 
15 // Constructors per Inst opcode
16 
InitAlt(uint32 out,uint32 out1)17 void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
18   DCHECK_EQ(out_opcode_, 0);
19   set_out_opcode(out, kInstAlt);
20   out1_ = out1;
21 }
22 
InitByteRange(int lo,int hi,int foldcase,uint32 out)23 void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
24   DCHECK_EQ(out_opcode_, 0);
25   set_out_opcode(out, kInstByteRange);
26   lo_ = lo & 0xFF;
27   hi_ = hi & 0xFF;
28   foldcase_ = foldcase;
29 }
30 
InitCapture(int cap,uint32 out)31 void Prog::Inst::InitCapture(int cap, uint32 out) {
32   DCHECK_EQ(out_opcode_, 0);
33   set_out_opcode(out, kInstCapture);
34   cap_ = cap;
35 }
36 
InitEmptyWidth(EmptyOp empty,uint32 out)37 void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
38   DCHECK_EQ(out_opcode_, 0);
39   set_out_opcode(out, kInstEmptyWidth);
40   empty_ = empty;
41 }
42 
InitMatch(int32 id)43 void Prog::Inst::InitMatch(int32 id) {
44   DCHECK_EQ(out_opcode_, 0);
45   set_opcode(kInstMatch);
46   match_id_ = id;
47 }
48 
InitNop(uint32 out)49 void Prog::Inst::InitNop(uint32 out) {
50   DCHECK_EQ(out_opcode_, 0);
51   set_opcode(kInstNop);
52 }
53 
InitFail()54 void Prog::Inst::InitFail() {
55   DCHECK_EQ(out_opcode_, 0);
56   set_opcode(kInstFail);
57 }
58 
Dump()59 string Prog::Inst::Dump() {
60   switch (opcode()) {
61     default:
62       return StringPrintf("opcode %d", static_cast<int>(opcode()));
63 
64     case kInstAlt:
65       return StringPrintf("alt -> %d | %d", out(), out1_);
66 
67     case kInstAltMatch:
68       return StringPrintf("altmatch -> %d | %d", out(), out1_);
69 
70     case kInstByteRange:
71       return StringPrintf("byte%s [%02x-%02x] -> %d",
72                           foldcase_ ? "/i" : "",
73                           lo_, hi_, out());
74 
75     case kInstCapture:
76       return StringPrintf("capture %d -> %d", cap_, out());
77 
78     case kInstEmptyWidth:
79       return StringPrintf("emptywidth %#x -> %d",
80                           static_cast<int>(empty_), out());
81 
82     case kInstMatch:
83       return StringPrintf("match! %d", match_id());
84 
85     case kInstNop:
86       return StringPrintf("nop -> %d", out());
87 
88     case kInstFail:
89       return StringPrintf("fail");
90   }
91 }
92 
Prog()93 Prog::Prog()
94   : anchor_start_(false),
95     anchor_end_(false),
96     reversed_(false),
97     did_onepass_(false),
98     start_(0),
99     start_unanchored_(0),
100     size_(0),
101     byte_inst_count_(0),
102     bytemap_range_(0),
103     flags_(0),
104     onepass_statesize_(0),
105     inst_(NULL),
106     dfa_first_(NULL),
107     dfa_longest_(NULL),
108     dfa_mem_(0),
109     delete_dfa_(NULL),
110     unbytemap_(NULL),
111     onepass_nodes_(NULL),
112     onepass_start_(NULL) {
113 }
114 
~Prog()115 Prog::~Prog() {
116   if (delete_dfa_) {
117     if (dfa_first_)
118       delete_dfa_(dfa_first_);
119     if (dfa_longest_)
120       delete_dfa_(dfa_longest_);
121   }
122   delete[] onepass_nodes_;
123   delete[] inst_;
124   delete[] unbytemap_;
125 }
126 
127 typedef SparseSet Workq;
128 
AddToQueue(Workq * q,int id)129 static inline void AddToQueue(Workq* q, int id) {
130   if (id != 0)
131     q->insert(id);
132 }
133 
ProgToString(Prog * prog,Workq * q)134 static string ProgToString(Prog* prog, Workq* q) {
135   string s;
136 
137   for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
138     int id = *i;
139     Prog::Inst* ip = prog->inst(id);
140     StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
141     AddToQueue(q, ip->out());
142     if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
143       AddToQueue(q, ip->out1());
144   }
145   return s;
146 }
147 
Dump()148 string Prog::Dump() {
149   string map;
150   if (false) {  // Debugging
151     int lo = 0;
152     StringAppendF(&map, "byte map:\n");
153     for (int i = 0; i < bytemap_range_; i++) {
154       StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
155       lo = unbytemap_[i] + 1;
156     }
157     StringAppendF(&map, "\n");
158   }
159 
160   Workq q(size_);
161   AddToQueue(&q, start_);
162   return map + ProgToString(this, &q);
163 }
164 
DumpUnanchored()165 string Prog::DumpUnanchored() {
166   Workq q(size_);
167   AddToQueue(&q, start_unanchored_);
168   return ProgToString(this, &q);
169 }
170 
171 static bool IsMatch(Prog*, Prog::Inst*);
172 
173 // Peep-hole optimizer.
Optimize()174 void Prog::Optimize() {
175   Workq q(size_);
176 
177   // Eliminate nops.  Most are taken out during compilation
178   // but a few are hard to avoid.
179   q.clear();
180   AddToQueue(&q, start_);
181   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
182     int id = *i;
183 
184     Inst* ip = inst(id);
185     int j = ip->out();
186     Inst* jp;
187     while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
188       j = jp->out();
189     }
190     ip->set_out(j);
191     AddToQueue(&q, ip->out());
192 
193     if (ip->opcode() == kInstAlt) {
194       j = ip->out1();
195       while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
196         j = jp->out();
197       }
198       ip->out1_ = j;
199       AddToQueue(&q, ip->out1());
200     }
201   }
202 
203   // Insert kInstAltMatch instructions
204   // Look for
205   //   ip: Alt -> j | k
206   //	  j: ByteRange [00-FF] -> ip
207   //    k: Match
208   // or the reverse (the above is the greedy one).
209   // Rewrite Alt to AltMatch.
210   q.clear();
211   AddToQueue(&q, start_);
212   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
213     int id = *i;
214     Inst* ip = inst(id);
215     AddToQueue(&q, ip->out());
216     if (ip->opcode() == kInstAlt)
217       AddToQueue(&q, ip->out1());
218 
219     if (ip->opcode() == kInstAlt) {
220       Inst* j = inst(ip->out());
221       Inst* k = inst(ip->out1());
222       if (j->opcode() == kInstByteRange && j->out() == id &&
223           j->lo() == 0x00 && j->hi() == 0xFF &&
224           IsMatch(this, k)) {
225         ip->set_opcode(kInstAltMatch);
226         continue;
227       }
228       if (IsMatch(this, j) &&
229           k->opcode() == kInstByteRange && k->out() == id &&
230           k->lo() == 0x00 && k->hi() == 0xFF) {
231         ip->set_opcode(kInstAltMatch);
232       }
233     }
234   }
235 }
236 
237 // Is ip a guaranteed match at end of text, perhaps after some capturing?
IsMatch(Prog * prog,Prog::Inst * ip)238 static bool IsMatch(Prog* prog, Prog::Inst* ip) {
239   for (;;) {
240     switch (ip->opcode()) {
241       default:
242         LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
243         return false;
244 
245       case kInstAlt:
246       case kInstAltMatch:
247       case kInstByteRange:
248       case kInstFail:
249       case kInstEmptyWidth:
250         return false;
251 
252       case kInstCapture:
253       case kInstNop:
254         ip = prog->inst(ip->out());
255         break;
256 
257       case kInstMatch:
258         return true;
259     }
260   }
261 }
262 
EmptyFlags(const StringPiece & text,const char * p)263 uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
264   int flags = 0;
265 
266   // ^ and \A
267   if (p == text.begin())
268     flags |= kEmptyBeginText | kEmptyBeginLine;
269   else if (p[-1] == '\n')
270     flags |= kEmptyBeginLine;
271 
272   // $ and \z
273   if (p == text.end())
274     flags |= kEmptyEndText | kEmptyEndLine;
275   else if (p < text.end() && p[0] == '\n')
276     flags |= kEmptyEndLine;
277 
278   // \b and \B
279   if (p == text.begin() && p == text.end()) {
280     // no word boundary here
281   } else if (p == text.begin()) {
282     if (IsWordChar(p[0]))
283       flags |= kEmptyWordBoundary;
284   } else if (p == text.end()) {
285     if (IsWordChar(p[-1]))
286       flags |= kEmptyWordBoundary;
287   } else {
288     if (IsWordChar(p[-1]) != IsWordChar(p[0]))
289       flags |= kEmptyWordBoundary;
290   }
291   if (!(flags & kEmptyWordBoundary))
292     flags |= kEmptyNonWordBoundary;
293 
294   return flags;
295 }
296 
MarkByteRange(int lo,int hi)297 void Prog::MarkByteRange(int lo, int hi) {
298   CHECK_GE(lo, 0);
299   CHECK_GE(hi, 0);
300   CHECK_LE(lo, 255);
301   CHECK_LE(hi, 255);
302   if (lo > 0)
303     byterange_.Set(lo - 1);
304   byterange_.Set(hi);
305 }
306 
ComputeByteMap()307 void Prog::ComputeByteMap() {
308   // Fill in bytemap with byte classes for prog_.
309   // Ranges of bytes that are treated as indistinguishable
310   // by the regexp program are mapped to a single byte class.
311   // The vector prog_->byterange() marks the end of each
312   // such range.
313   const Bitmap<256>& v = byterange();
314 
315   COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
316   uint8 n = 0;
317   uint32 bits = 0;
318   for (int i = 0; i < 256; i++) {
319     if ((i&31) == 0)
320       bits = v.Word(i >> 5);
321     bytemap_[i] = n;
322     n += bits & 1;
323     bits >>= 1;
324   }
325   bytemap_range_ = bytemap_[255] + 1;
326   unbytemap_ = new uint8[bytemap_range_];
327   for (int i = 0; i < 256; i++)
328     unbytemap_[bytemap_[i]] = i;
329 
330   if (0) {  // For debugging: use trivial byte map.
331     for (int i = 0; i < 256; i++) {
332       bytemap_[i] = i;
333       unbytemap_[i] = i;
334     }
335     bytemap_range_ = 256;
336     LOG(INFO) << "Using trivial bytemap.";
337   }
338 }
339 
340 }  // namespace re2
341 
342