1 use std::cmp::Ordering;
2 use std::collections::HashMap;
3 use std::fmt;
4 use std::mem;
5 use std::ops::Deref;
6 use std::slice;
7 use std::sync::Arc;
8 
9 use input::Char;
10 use literal::LiteralSearcher;
11 
12 /// `InstPtr` represents the index of an instruction in a regex program.
13 pub type InstPtr = usize;
14 
15 /// Program is a sequence of instructions and various facts about thos
16 /// instructions.
17 #[derive(Clone)]
18 pub struct Program {
19     /// A sequence of instructions that represents an NFA.
20     pub insts: Vec<Inst>,
21     /// Pointers to each Match instruction in the sequence.
22     ///
23     /// This is always length 1 unless this program represents a regex set.
24     pub matches: Vec<InstPtr>,
25     /// The ordered sequence of all capture groups extracted from the AST.
26     /// Unnamed groups are `None`.
27     pub captures: Vec<Option<String>>,
28     /// Pointers to all named capture groups into `captures`.
29     pub capture_name_idx: Arc<HashMap<String, usize>>,
30     /// A pointer to the start instruction. This can vary depending on how
31     /// the program was compiled. For example, programs for use with the DFA
32     /// engine have a `.*?` inserted at the beginning of unanchored regular
33     /// expressions. The actual starting point of the program is after the
34     /// `.*?`.
35     pub start: InstPtr,
36     /// A set of equivalence classes for discriminating bytes in the compiled
37     /// program.
38     pub byte_classes: Vec<u8>,
39     /// When true, this program can only match valid UTF-8.
40     pub only_utf8: bool,
41     /// When true, this program uses byte range instructions instead of Unicode
42     /// range instructions.
43     pub is_bytes: bool,
44     /// When true, the program is compiled for DFA matching. For example, this
45     /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
46     /// regexes.
47     pub is_dfa: bool,
48     /// When true, the program matches text in reverse (for use only in the
49     /// DFA).
50     pub is_reverse: bool,
51     /// Whether the regex must match from the start of the input.
52     pub is_anchored_start: bool,
53     /// Whether the regex must match at the end of the input.
54     pub is_anchored_end: bool,
55     /// Whether this program contains a Unicode word boundary instruction.
56     pub has_unicode_word_boundary: bool,
57     /// A possibly empty machine for very quickly matching prefix literals.
58     pub prefixes: LiteralSearcher,
59     /// A limit on the size of the cache that the DFA is allowed to use while
60     /// matching.
61     ///
62     /// The cache limit specifies approximately how much space we're willing to
63     /// give to the state cache. Once the state cache exceeds the size, it is
64     /// wiped and all states must be re-computed.
65     ///
66     /// Note that this value does not impact correctness. It can be set to 0
67     /// and the DFA will run just fine. (It will only ever store exactly one
68     /// state in the cache, and will likely run very slowly, but it will work.)
69     ///
70     /// Also note that this limit is *per thread of execution*. That is,
71     /// if the same regex is used to search text across multiple threads
72     /// simultaneously, then the DFA cache is not shared. Instead, copies are
73     /// made.
74     pub dfa_size_limit: usize,
75 }
76 
77 impl Program {
78     /// Creates an empty instruction sequence. Fields are given default
79     /// values.
new() -> Self80     pub fn new() -> Self {
81         Program {
82             insts: vec![],
83             matches: vec![],
84             captures: vec![],
85             capture_name_idx: Arc::new(HashMap::new()),
86             start: 0,
87             byte_classes: vec![0; 256],
88             only_utf8: true,
89             is_bytes: false,
90             is_dfa: false,
91             is_reverse: false,
92             is_anchored_start: false,
93             is_anchored_end: false,
94             has_unicode_word_boundary: false,
95             prefixes: LiteralSearcher::empty(),
96             dfa_size_limit: 2 * (1 << 20),
97         }
98     }
99 
100     /// If pc is an index to a no-op instruction (like Save), then return the
101     /// next pc that is not a no-op instruction.
skip(&self, mut pc: usize) -> usize102     pub fn skip(&self, mut pc: usize) -> usize {
103         loop {
104             match self[pc] {
105                 Inst::Save(ref i) => pc = i.goto,
106                 _ => return pc,
107             }
108         }
109     }
110 
111     /// Return true if and only if an execution engine at instruction `pc` will
112     /// always lead to a match.
leads_to_match(&self, pc: usize) -> bool113     pub fn leads_to_match(&self, pc: usize) -> bool {
114         if self.matches.len() > 1 {
115             // If we have a regex set, then we have more than one ending
116             // state, so leading to one of those states is generally
117             // meaningless.
118             return false;
119         }
120         match self[self.skip(pc)] {
121             Inst::Match(_) => true,
122             _ => false,
123         }
124     }
125 
126     /// Returns true if the current configuration demands that an implicit
127     /// `.*?` be prepended to the instruction sequence.
needs_dotstar(&self) -> bool128     pub fn needs_dotstar(&self) -> bool {
129         self.is_dfa && !self.is_reverse && !self.is_anchored_start
130     }
131 
132     /// Returns true if this program uses Byte instructions instead of
133     /// Char/Range instructions.
uses_bytes(&self) -> bool134     pub fn uses_bytes(&self) -> bool {
135         self.is_bytes || self.is_dfa
136     }
137 
138     /// Returns true if this program exclusively matches valid UTF-8 bytes.
139     ///
140     /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
only_utf8(&self) -> bool141     pub fn only_utf8(&self) -> bool {
142         self.only_utf8
143     }
144 
145     /// Return the approximate heap usage of this instruction sequence in
146     /// bytes.
approximate_size(&self) -> usize147     pub fn approximate_size(&self) -> usize {
148         // The only instruction that uses heap space is Ranges (for
149         // Unicode codepoint programs) to store non-overlapping codepoint
150         // ranges. To keep this operation constant time, we ignore them.
151         (self.len() * mem::size_of::<Inst>())
152             + (self.matches.len() * mem::size_of::<InstPtr>())
153             + (self.captures.len() * mem::size_of::<Option<String>>())
154             + (self.capture_name_idx.len()
155                 * (mem::size_of::<String>() + mem::size_of::<usize>()))
156             + (self.byte_classes.len() * mem::size_of::<u8>())
157             + self.prefixes.approximate_size()
158     }
159 }
160 
161 impl Deref for Program {
162     type Target = [Inst];
163 
164     #[cfg_attr(feature = "perf-inline", inline(always))]
deref(&self) -> &Self::Target165     fn deref(&self) -> &Self::Target {
166         &*self.insts
167     }
168 }
169 
170 impl fmt::Debug for Program {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result171     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
172         use self::Inst::*;
173 
174         fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
175             if goto == cur + 1 {
176                 fmtd
177             } else {
178                 format!("{} (goto: {})", fmtd, goto)
179             }
180         }
181 
182         fn visible_byte(b: u8) -> String {
183             use std::ascii::escape_default;
184             let escaped = escape_default(b).collect::<Vec<u8>>();
185             String::from_utf8_lossy(&escaped).into_owned()
186         }
187 
188         for (pc, inst) in self.iter().enumerate() {
189             match *inst {
190                 Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
191                 Save(ref inst) => {
192                     let s = format!("{:04} Save({})", pc, inst.slot);
193                     write!(f, "{}", with_goto(pc, inst.goto, s))?;
194                 }
195                 Split(ref inst) => {
196                     write!(
197                         f,
198                         "{:04} Split({}, {})",
199                         pc, inst.goto1, inst.goto2
200                     )?;
201                 }
202                 EmptyLook(ref inst) => {
203                     let s = format!("{:?}", inst.look);
204                     write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
205                 }
206                 Char(ref inst) => {
207                     let s = format!("{:?}", inst.c);
208                     write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
209                 }
210                 Ranges(ref inst) => {
211                     let ranges = inst
212                         .ranges
213                         .iter()
214                         .map(|r| format!("{:?}-{:?}", r.0, r.1))
215                         .collect::<Vec<String>>()
216                         .join(", ");
217                     write!(
218                         f,
219                         "{:04} {}",
220                         pc,
221                         with_goto(pc, inst.goto, ranges)
222                     )?;
223                 }
224                 Bytes(ref inst) => {
225                     let s = format!(
226                         "Bytes({}, {})",
227                         visible_byte(inst.start),
228                         visible_byte(inst.end)
229                     );
230                     write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
231                 }
232             }
233             if pc == self.start {
234                 write!(f, " (start)")?;
235             }
236             write!(f, "\n")?;
237         }
238         Ok(())
239     }
240 }
241 
242 impl<'a> IntoIterator for &'a Program {
243     type Item = &'a Inst;
244     type IntoIter = slice::Iter<'a, Inst>;
into_iter(self) -> Self::IntoIter245     fn into_iter(self) -> Self::IntoIter {
246         self.iter()
247     }
248 }
249 
250 /// Inst is an instruction code in a Regex program.
251 ///
252 /// Regrettably, a regex program either contains Unicode codepoint
253 /// instructions (Char and Ranges) or it contains byte instructions (Bytes).
254 /// A regex program can never contain both.
255 ///
256 /// It would be worth investigating splitting this into two distinct types and
257 /// then figuring out how to make the matching engines polymorphic over those
258 /// types without sacrificing performance.
259 ///
260 /// Other than the benefit of moving invariants into the type system, another
261 /// benefit is the decreased size. If we remove the `Char` and `Ranges`
262 /// instructions from the `Inst` enum, then its size shrinks from 40 bytes to
263 /// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges`
264 /// variant.) Given that byte based machines are typically much bigger than
265 /// their Unicode analogues (because they can decode UTF-8 directly), this ends
266 /// up being a pretty significant savings.
267 #[derive(Clone, Debug)]
268 pub enum Inst {
269     /// Match indicates that the program has reached a match state.
270     ///
271     /// The number in the match corresponds to the Nth logical regular
272     /// expression in this program. This index is always 0 for normal regex
273     /// programs. Values greater than 0 appear when compiling regex sets, and
274     /// each match instruction gets its own unique value. The value corresponds
275     /// to the Nth regex in the set.
276     Match(usize),
277     /// Save causes the program to save the current location of the input in
278     /// the slot indicated by InstSave.
279     Save(InstSave),
280     /// Split causes the program to diverge to one of two paths in the
281     /// program, preferring goto1 in InstSplit.
282     Split(InstSplit),
283     /// EmptyLook represents a zero-width assertion in a regex program. A
284     /// zero-width assertion does not consume any of the input text.
285     EmptyLook(InstEmptyLook),
286     /// Char requires the regex program to match the character in InstChar at
287     /// the current position in the input.
288     Char(InstChar),
289     /// Ranges requires the regex program to match the character at the current
290     /// position in the input with one of the ranges specified in InstRanges.
291     Ranges(InstRanges),
292     /// Bytes is like Ranges, except it expresses a single byte range. It is
293     /// used in conjunction with Split instructions to implement multi-byte
294     /// character classes.
295     Bytes(InstBytes),
296 }
297 
298 impl Inst {
299     /// Returns true if and only if this is a match instruction.
is_match(&self) -> bool300     pub fn is_match(&self) -> bool {
301         match *self {
302             Inst::Match(_) => true,
303             _ => false,
304         }
305     }
306 }
307 
308 /// Representation of the Save instruction.
309 #[derive(Clone, Debug)]
310 pub struct InstSave {
311     /// The next location to execute in the program.
312     pub goto: InstPtr,
313     /// The capture slot (there are two slots for every capture in a regex,
314     /// including the zeroth capture for the entire match).
315     pub slot: usize,
316 }
317 
318 /// Representation of the Split instruction.
319 #[derive(Clone, Debug)]
320 pub struct InstSplit {
321     /// The first instruction to try. A match resulting from following goto1
322     /// has precedence over a match resulting from following goto2.
323     pub goto1: InstPtr,
324     /// The second instruction to try. A match resulting from following goto1
325     /// has precedence over a match resulting from following goto2.
326     pub goto2: InstPtr,
327 }
328 
329 /// Representation of the `EmptyLook` instruction.
330 #[derive(Clone, Debug)]
331 pub struct InstEmptyLook {
332     /// The next location to execute in the program if this instruction
333     /// succeeds.
334     pub goto: InstPtr,
335     /// The type of zero-width assertion to check.
336     pub look: EmptyLook,
337 }
338 
339 /// The set of zero-width match instructions.
340 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
341 pub enum EmptyLook {
342     /// Start of line or input.
343     StartLine,
344     /// End of line or input.
345     EndLine,
346     /// Start of input.
347     StartText,
348     /// End of input.
349     EndText,
350     /// Word character on one side and non-word character on other.
351     WordBoundary,
352     /// Word character on both sides or non-word character on both sides.
353     NotWordBoundary,
354     /// ASCII word boundary.
355     WordBoundaryAscii,
356     /// Not ASCII word boundary.
357     NotWordBoundaryAscii,
358 }
359 
360 /// Representation of the Char instruction.
361 #[derive(Clone, Debug)]
362 pub struct InstChar {
363     /// The next location to execute in the program if this instruction
364     /// succeeds.
365     pub goto: InstPtr,
366     /// The character to test.
367     pub c: char,
368 }
369 
370 /// Representation of the Ranges instruction.
371 #[derive(Clone, Debug)]
372 pub struct InstRanges {
373     /// The next location to execute in the program if this instruction
374     /// succeeds.
375     pub goto: InstPtr,
376     /// The set of Unicode scalar value ranges to test.
377     pub ranges: Vec<(char, char)>,
378 }
379 
380 impl InstRanges {
381     /// Tests whether the given input character matches this instruction.
matches(&self, c: Char) -> bool382     pub fn matches(&self, c: Char) -> bool {
383         // This speeds up the `match_class_unicode` benchmark by checking
384         // some common cases quickly without binary search. e.g., Matching
385         // a Unicode class on predominantly ASCII text.
386         for r in self.ranges.iter().take(4) {
387             if c < r.0 {
388                 return false;
389             }
390             if c <= r.1 {
391                 return true;
392             }
393         }
394         self.ranges
395             .binary_search_by(|r| {
396                 if r.1 < c {
397                     Ordering::Less
398                 } else if r.0 > c {
399                     Ordering::Greater
400                 } else {
401                     Ordering::Equal
402                 }
403             })
404             .is_ok()
405     }
406 
407     /// Return the number of distinct characters represented by all of the
408     /// ranges.
num_chars(&self) -> usize409     pub fn num_chars(&self) -> usize {
410         self.ranges
411             .iter()
412             .map(|&(s, e)| 1 + (e as u32) - (s as u32))
413             .sum::<u32>() as usize
414     }
415 }
416 
417 /// Representation of the Bytes instruction.
418 #[derive(Clone, Debug)]
419 pub struct InstBytes {
420     /// The next location to execute in the program if this instruction
421     /// succeeds.
422     pub goto: InstPtr,
423     /// The start (inclusive) of this byte range.
424     pub start: u8,
425     /// The end (inclusive) of this byte range.
426     pub end: u8,
427 }
428 
429 impl InstBytes {
430     /// Returns true if and only if the given byte is in this range.
matches(&self, byte: u8) -> bool431     pub fn matches(&self, byte: u8) -> bool {
432         self.start <= byte && byte <= self.end
433     }
434 }
435