• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  use std::cell::RefCell;
2  use std::collections::HashMap;
3  use std::panic::AssertUnwindSafe;
4  use std::sync::Arc;
5  
6  #[cfg(feature = "perf-literal")]
7  use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
8  use syntax::hir::literal::Literals;
9  use syntax::hir::Hir;
10  use syntax::ParserBuilder;
11  
12  use backtrack;
13  use compile::Compiler;
14  #[cfg(feature = "perf-dfa")]
15  use dfa;
16  use error::Error;
17  use input::{ByteInput, CharInput};
18  use literal::LiteralSearcher;
19  use pikevm;
20  use pool::{Pool, PoolGuard};
21  use prog::Program;
22  use re_builder::RegexOptions;
23  use re_bytes;
24  use re_set;
25  use re_trait::{Locations, RegularExpression, Slot};
26  use re_unicode;
27  use utf8::next_utf8;
28  
29  /// `Exec` manages the execution of a regular expression.
30  ///
31  /// In particular, this manages the various compiled forms of a single regular
32  /// expression and the choice of which matching engine to use to execute a
33  /// regular expression.
34  #[derive(Debug)]
35  pub struct Exec {
36      /// All read only state.
37      ro: Arc<ExecReadOnly>,
38      /// A pool of reusable values for the various matching engines.
39      ///
40      /// Note that boxing this value is not strictly necessary, but it is an
41      /// easy way to ensure that T does not bloat the stack sized used by a pool
42      /// in the case where T is big. And this turns out to be the case at the
43      /// time of writing for regex's use of this pool. At the time of writing,
44      /// the size of a Regex on the stack is 856 bytes. Boxing this value
45      /// reduces that size to 16 bytes.
46      pool: Box<Pool<ProgramCache>>,
47  }
48  
49  /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
50  /// means it is no longer Sync, but we can now avoid the overhead of
51  /// synchronization to fetch the cache.
52  #[derive(Debug)]
53  pub struct ExecNoSync<'c> {
54      /// All read only state.
55      ro: &'c Arc<ExecReadOnly>,
56      /// Caches for the various matching engines.
57      cache: PoolGuard<'c, ProgramCache>,
58  }
59  
60  /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
61  #[derive(Debug)]
62  pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
63  
64  /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
65  /// state is determined at compile time and never changes during search.
66  #[derive(Debug)]
67  struct ExecReadOnly {
68      /// The original regular expressions given by the caller to compile.
69      res: Vec<String>,
70      /// A compiled program that is used in the NFA simulation and backtracking.
71      /// It can be byte-based or Unicode codepoint based.
72      ///
73      /// N.B. It is not possibly to make this byte-based from the public API.
74      /// It is only used for testing byte based programs in the NFA simulations.
75      nfa: Program,
76      /// A compiled byte based program for DFA execution. This is only used
77      /// if a DFA can be executed. (Currently, only word boundary assertions are
78      /// not supported.) Note that this program contains an embedded `.*?`
79      /// preceding the first capture group, unless the regex is anchored at the
80      /// beginning.
81      dfa: Program,
82      /// The same as above, except the program is reversed (and there is no
83      /// preceding `.*?`). This is used by the DFA to find the starting location
84      /// of matches.
85      dfa_reverse: Program,
86      /// A set of suffix literals extracted from the regex.
87      ///
88      /// Prefix literals are stored on the `Program`, since they are used inside
89      /// the matching engines.
90      suffixes: LiteralSearcher,
91      /// An Aho-Corasick automaton with leftmost-first match semantics.
92      ///
93      /// This is only set when the entire regex is a simple unanchored
94      /// alternation of literals. We could probably use it more circumstances,
95      /// but this is already hacky enough in this architecture.
96      ///
97      /// N.B. We use u32 as a state ID representation under the assumption that
98      /// if we were to exhaust the ID space, we probably would have long
99      /// surpassed the compilation size limit.
100      #[cfg(feature = "perf-literal")]
101      ac: Option<AhoCorasick<u32>>,
102      /// match_type encodes as much upfront knowledge about how we're going to
103      /// execute a search as possible.
104      match_type: MatchType,
105  }
106  
107  /// Facilitates the construction of an executor by exposing various knobs
108  /// to control how a regex is executed and what kinds of resources it's
109  /// permitted to use.
110  // `ExecBuilder` is only public via the `internal` module, so avoid deriving
111  // `Debug`.
112  #[allow(missing_debug_implementations)]
113  pub struct ExecBuilder {
114      options: RegexOptions,
115      match_type: Option<MatchType>,
116      bytes: bool,
117      only_utf8: bool,
118  }
119  
120  /// Parsed represents a set of parsed regular expressions and their detected
121  /// literals.
122  struct Parsed {
123      exprs: Vec<Hir>,
124      prefixes: Literals,
125      suffixes: Literals,
126      bytes: bool,
127  }
128  
129  impl ExecBuilder {
130      /// Create a regex execution builder.
131      ///
132      /// This uses default settings for everything except the regex itself,
133      /// which must be provided. Further knobs can be set by calling methods,
134      /// and then finally, `build` to actually create the executor.
new(re: &str) -> Self135      pub fn new(re: &str) -> Self {
136          Self::new_many(&[re])
137      }
138  
139      /// Like new, but compiles the union of the given regular expressions.
140      ///
141      /// Note that when compiling 2 or more regular expressions, capture groups
142      /// are completely unsupported. (This means both `find` and `captures`
143      /// wont work.)
new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item = S>,144      pub fn new_many<I, S>(res: I) -> Self
145      where
146          S: AsRef<str>,
147          I: IntoIterator<Item = S>,
148      {
149          let mut opts = RegexOptions::default();
150          opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
151          Self::new_options(opts)
152      }
153  
154      /// Create a regex execution builder.
new_options(opts: RegexOptions) -> Self155      pub fn new_options(opts: RegexOptions) -> Self {
156          ExecBuilder {
157              options: opts,
158              match_type: None,
159              bytes: false,
160              only_utf8: true,
161          }
162      }
163  
164      /// Set the matching engine to be automatically determined.
165      ///
166      /// This is the default state and will apply whatever optimizations are
167      /// possible, such as running a DFA.
168      ///
169      /// This overrides whatever was previously set via the `nfa` or
170      /// `bounded_backtracking` methods.
automatic(mut self) -> Self171      pub fn automatic(mut self) -> Self {
172          self.match_type = None;
173          self
174      }
175  
176      /// Sets the matching engine to use the NFA algorithm no matter what
177      /// optimizations are possible.
178      ///
179      /// This overrides whatever was previously set via the `automatic` or
180      /// `bounded_backtracking` methods.
nfa(mut self) -> Self181      pub fn nfa(mut self) -> Self {
182          self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
183          self
184      }
185  
186      /// Sets the matching engine to use a bounded backtracking engine no
187      /// matter what optimizations are possible.
188      ///
189      /// One must use this with care, since the bounded backtracking engine
190      /// uses memory proportion to `len(regex) * len(text)`.
191      ///
192      /// This overrides whatever was previously set via the `automatic` or
193      /// `nfa` methods.
bounded_backtracking(mut self) -> Self194      pub fn bounded_backtracking(mut self) -> Self {
195          self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
196          self
197      }
198  
199      /// Compiles byte based programs for use with the NFA matching engines.
200      ///
201      /// By default, the NFA engines match on Unicode scalar values. They can
202      /// be made to use byte based programs instead. In general, the byte based
203      /// programs are slower because of a less efficient encoding of character
204      /// classes.
205      ///
206      /// Note that this does not impact DFA matching engines, which always
207      /// execute on bytes.
bytes(mut self, yes: bool) -> Self208      pub fn bytes(mut self, yes: bool) -> Self {
209          self.bytes = yes;
210          self
211      }
212  
213      /// When disabled, the program compiled may match arbitrary bytes.
214      ///
215      /// When enabled (the default), all compiled programs exclusively match
216      /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self217      pub fn only_utf8(mut self, yes: bool) -> Self {
218          self.only_utf8 = yes;
219          self
220      }
221  
222      /// Set the Unicode flag.
unicode(mut self, yes: bool) -> Self223      pub fn unicode(mut self, yes: bool) -> Self {
224          self.options.unicode = yes;
225          self
226      }
227  
228      /// Parse the current set of patterns into their AST and extract literals.
parse(&self) -> Result<Parsed, Error>229      fn parse(&self) -> Result<Parsed, Error> {
230          let mut exprs = Vec::with_capacity(self.options.pats.len());
231          let mut prefixes = Some(Literals::empty());
232          let mut suffixes = Some(Literals::empty());
233          let mut bytes = false;
234          let is_set = self.options.pats.len() > 1;
235          // If we're compiling a regex set and that set has any anchored
236          // expressions, then disable all literal optimizations.
237          for pat in &self.options.pats {
238              let mut parser = ParserBuilder::new()
239                  .octal(self.options.octal)
240                  .case_insensitive(self.options.case_insensitive)
241                  .multi_line(self.options.multi_line)
242                  .dot_matches_new_line(self.options.dot_matches_new_line)
243                  .swap_greed(self.options.swap_greed)
244                  .ignore_whitespace(self.options.ignore_whitespace)
245                  .unicode(self.options.unicode)
246                  .allow_invalid_utf8(!self.only_utf8)
247                  .nest_limit(self.options.nest_limit)
248                  .build();
249              let expr =
250                  parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
251              bytes = bytes || !expr.is_always_utf8();
252  
253              if cfg!(feature = "perf-literal") {
254                  if !expr.is_anchored_start() && expr.is_any_anchored_start() {
255                      // Partial anchors unfortunately make it hard to use
256                      // prefixes, so disable them.
257                      prefixes = None;
258                  } else if is_set && expr.is_anchored_start() {
259                      // Regex sets with anchors do not go well with literal
260                      // optimizations.
261                      prefixes = None;
262                  }
263                  prefixes = prefixes.and_then(|mut prefixes| {
264                      if !prefixes.union_prefixes(&expr) {
265                          None
266                      } else {
267                          Some(prefixes)
268                      }
269                  });
270  
271                  if !expr.is_anchored_end() && expr.is_any_anchored_end() {
272                      // Partial anchors unfortunately make it hard to use
273                      // suffixes, so disable them.
274                      suffixes = None;
275                  } else if is_set && expr.is_anchored_end() {
276                      // Regex sets with anchors do not go well with literal
277                      // optimizations.
278                      suffixes = None;
279                  }
280                  suffixes = suffixes.and_then(|mut suffixes| {
281                      if !suffixes.union_suffixes(&expr) {
282                          None
283                      } else {
284                          Some(suffixes)
285                      }
286                  });
287              }
288              exprs.push(expr);
289          }
290          Ok(Parsed {
291              exprs: exprs,
292              prefixes: prefixes.unwrap_or_else(Literals::empty),
293              suffixes: suffixes.unwrap_or_else(Literals::empty),
294              bytes: bytes,
295          })
296      }
297  
298      /// Build an executor that can run a regular expression.
build(self) -> Result<Exec, Error>299      pub fn build(self) -> Result<Exec, Error> {
300          // Special case when we have no patterns to compile.
301          // This can happen when compiling a regex set.
302          if self.options.pats.is_empty() {
303              let ro = Arc::new(ExecReadOnly {
304                  res: vec![],
305                  nfa: Program::new(),
306                  dfa: Program::new(),
307                  dfa_reverse: Program::new(),
308                  suffixes: LiteralSearcher::empty(),
309                  #[cfg(feature = "perf-literal")]
310                  ac: None,
311                  match_type: MatchType::Nothing,
312              });
313              let pool = ExecReadOnly::new_pool(&ro);
314              return Ok(Exec { ro: ro, pool });
315          }
316          let parsed = self.parse()?;
317          let mut nfa = Compiler::new()
318              .size_limit(self.options.size_limit)
319              .bytes(self.bytes || parsed.bytes)
320              .only_utf8(self.only_utf8)
321              .compile(&parsed.exprs)?;
322          let mut dfa = Compiler::new()
323              .size_limit(self.options.size_limit)
324              .dfa(true)
325              .only_utf8(self.only_utf8)
326              .compile(&parsed.exprs)?;
327          let mut dfa_reverse = Compiler::new()
328              .size_limit(self.options.size_limit)
329              .dfa(true)
330              .only_utf8(self.only_utf8)
331              .reverse(true)
332              .compile(&parsed.exprs)?;
333  
334          #[cfg(feature = "perf-literal")]
335          let ac = self.build_aho_corasick(&parsed);
336          nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
337          dfa.prefixes = nfa.prefixes.clone();
338          dfa.dfa_size_limit = self.options.dfa_size_limit;
339          dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
340  
341          let mut ro = ExecReadOnly {
342              res: self.options.pats,
343              nfa: nfa,
344              dfa: dfa,
345              dfa_reverse: dfa_reverse,
346              suffixes: LiteralSearcher::suffixes(parsed.suffixes),
347              #[cfg(feature = "perf-literal")]
348              ac: ac,
349              match_type: MatchType::Nothing,
350          };
351          ro.match_type = ro.choose_match_type(self.match_type);
352  
353          let ro = Arc::new(ro);
354          let pool = ExecReadOnly::new_pool(&ro);
355          Ok(Exec { ro, pool })
356      }
357  
358      #[cfg(feature = "perf-literal")]
build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>>359      fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
360          if parsed.exprs.len() != 1 {
361              return None;
362          }
363          let lits = match alternation_literals(&parsed.exprs[0]) {
364              None => return None,
365              Some(lits) => lits,
366          };
367          // If we have a small number of literals, then let Teddy handle
368          // things (see literal/mod.rs).
369          if lits.len() <= 32 {
370              return None;
371          }
372          Some(
373              AhoCorasickBuilder::new()
374                  .match_kind(MatchKind::LeftmostFirst)
375                  .auto_configure(&lits)
376                  // We always want this to reduce size, regardless
377                  // of what auto-configure does.
378                  .byte_classes(true)
379                  .build_with_size::<u32, _, _>(&lits)
380                  // This should never happen because we'd long exceed the
381                  // compilation limit for regexes first.
382                  .expect("AC automaton too big"),
383          )
384      }
385  }
386  
387  impl<'c> RegularExpression for ExecNoSyncStr<'c> {
388      type Text = str;
389  
slots_len(&self) -> usize390      fn slots_len(&self) -> usize {
391          self.0.slots_len()
392      }
393  
next_after_empty(&self, text: &str, i: usize) -> usize394      fn next_after_empty(&self, text: &str, i: usize) -> usize {
395          next_utf8(text.as_bytes(), i)
396      }
397  
398      #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &str, start: usize) -> Option<usize>399      fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
400          self.0.shortest_match_at(text.as_bytes(), start)
401      }
402  
403      #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &str, start: usize) -> bool404      fn is_match_at(&self, text: &str, start: usize) -> bool {
405          self.0.is_match_at(text.as_bytes(), start)
406      }
407  
408      #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>409      fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
410          self.0.find_at(text.as_bytes(), start)
411      }
412  
413      #[cfg_attr(feature = "perf-inline", inline(always))]
captures_read_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>414      fn captures_read_at(
415          &self,
416          locs: &mut Locations,
417          text: &str,
418          start: usize,
419      ) -> Option<(usize, usize)> {
420          self.0.captures_read_at(locs, text.as_bytes(), start)
421      }
422  }
423  
424  impl<'c> RegularExpression for ExecNoSync<'c> {
425      type Text = [u8];
426  
427      /// Returns the number of capture slots in the regular expression. (There
428      /// are two slots for every capture group, corresponding to possibly empty
429      /// start and end locations of the capture.)
slots_len(&self) -> usize430      fn slots_len(&self) -> usize {
431          self.ro.nfa.captures.len() * 2
432      }
433  
next_after_empty(&self, _text: &[u8], i: usize) -> usize434      fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
435          i + 1
436      }
437  
438      /// Returns the end of a match location, possibly occurring before the
439      /// end location of the correct leftmost-first match.
440      #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>441      fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
442          if !self.is_anchor_end_match(text) {
443              return None;
444          }
445          match self.ro.match_type {
446              #[cfg(feature = "perf-literal")]
447              MatchType::Literal(ty) => {
448                  self.find_literals(ty, text, start).map(|(_, e)| e)
449              }
450              #[cfg(feature = "perf-dfa")]
451              MatchType::Dfa | MatchType::DfaMany => {
452                  match self.shortest_dfa(text, start) {
453                      dfa::Result::Match(end) => Some(end),
454                      dfa::Result::NoMatch(_) => None,
455                      dfa::Result::Quit => self.shortest_nfa(text, start),
456                  }
457              }
458              #[cfg(feature = "perf-dfa")]
459              MatchType::DfaAnchoredReverse => {
460                  match dfa::Fsm::reverse(
461                      &self.ro.dfa_reverse,
462                      self.cache.value(),
463                      true,
464                      &text[start..],
465                      text.len(),
466                  ) {
467                      dfa::Result::Match(_) => Some(text.len()),
468                      dfa::Result::NoMatch(_) => None,
469                      dfa::Result::Quit => self.shortest_nfa(text, start),
470                  }
471              }
472              #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
473              MatchType::DfaSuffix => {
474                  match self.shortest_dfa_reverse_suffix(text, start) {
475                      dfa::Result::Match(e) => Some(e),
476                      dfa::Result::NoMatch(_) => None,
477                      dfa::Result::Quit => self.shortest_nfa(text, start),
478                  }
479              }
480              MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
481              MatchType::Nothing => None,
482          }
483      }
484  
485      /// Returns true if and only if the regex matches text.
486      ///
487      /// For single regular expressions, this is equivalent to calling
488      /// shortest_match(...).is_some().
489      #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &[u8], start: usize) -> bool490      fn is_match_at(&self, text: &[u8], start: usize) -> bool {
491          if !self.is_anchor_end_match(text) {
492              return false;
493          }
494          // We need to do this dance because shortest_match relies on the NFA
495          // filling in captures[1], but a RegexSet has no captures. In other
496          // words, a RegexSet can't (currently) use shortest_match. ---AG
497          match self.ro.match_type {
498              #[cfg(feature = "perf-literal")]
499              MatchType::Literal(ty) => {
500                  self.find_literals(ty, text, start).is_some()
501              }
502              #[cfg(feature = "perf-dfa")]
503              MatchType::Dfa | MatchType::DfaMany => {
504                  match self.shortest_dfa(text, start) {
505                      dfa::Result::Match(_) => true,
506                      dfa::Result::NoMatch(_) => false,
507                      dfa::Result::Quit => self.match_nfa(text, start),
508                  }
509              }
510              #[cfg(feature = "perf-dfa")]
511              MatchType::DfaAnchoredReverse => {
512                  match dfa::Fsm::reverse(
513                      &self.ro.dfa_reverse,
514                      self.cache.value(),
515                      true,
516                      &text[start..],
517                      text.len(),
518                  ) {
519                      dfa::Result::Match(_) => true,
520                      dfa::Result::NoMatch(_) => false,
521                      dfa::Result::Quit => self.match_nfa(text, start),
522                  }
523              }
524              #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
525              MatchType::DfaSuffix => {
526                  match self.shortest_dfa_reverse_suffix(text, start) {
527                      dfa::Result::Match(_) => true,
528                      dfa::Result::NoMatch(_) => false,
529                      dfa::Result::Quit => self.match_nfa(text, start),
530                  }
531              }
532              MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
533              MatchType::Nothing => false,
534          }
535      }
536  
537      /// Finds the start and end location of the leftmost-first match, starting
538      /// at the given location.
539      #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>540      fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
541          if !self.is_anchor_end_match(text) {
542              return None;
543          }
544          match self.ro.match_type {
545              #[cfg(feature = "perf-literal")]
546              MatchType::Literal(ty) => self.find_literals(ty, text, start),
547              #[cfg(feature = "perf-dfa")]
548              MatchType::Dfa => match self.find_dfa_forward(text, start) {
549                  dfa::Result::Match((s, e)) => Some((s, e)),
550                  dfa::Result::NoMatch(_) => None,
551                  dfa::Result::Quit => {
552                      self.find_nfa(MatchNfaType::Auto, text, start)
553                  }
554              },
555              #[cfg(feature = "perf-dfa")]
556              MatchType::DfaAnchoredReverse => {
557                  match self.find_dfa_anchored_reverse(text, start) {
558                      dfa::Result::Match((s, e)) => Some((s, e)),
559                      dfa::Result::NoMatch(_) => None,
560                      dfa::Result::Quit => {
561                          self.find_nfa(MatchNfaType::Auto, text, start)
562                      }
563                  }
564              }
565              #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
566              MatchType::DfaSuffix => {
567                  match self.find_dfa_reverse_suffix(text, start) {
568                      dfa::Result::Match((s, e)) => Some((s, e)),
569                      dfa::Result::NoMatch(_) => None,
570                      dfa::Result::Quit => {
571                          self.find_nfa(MatchNfaType::Auto, text, start)
572                      }
573                  }
574              }
575              MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
576              MatchType::Nothing => None,
577              #[cfg(feature = "perf-dfa")]
578              MatchType::DfaMany => {
579                  unreachable!("BUG: RegexSet cannot be used with find")
580              }
581          }
582      }
583  
584      /// Finds the start and end location of the leftmost-first match and also
585      /// fills in all matching capture groups.
586      ///
587      /// The number of capture slots given should be equal to the total number
588      /// of capture slots in the compiled program.
589      ///
590      /// Note that the first two slots always correspond to the start and end
591      /// locations of the overall match.
captures_read_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>592      fn captures_read_at(
593          &self,
594          locs: &mut Locations,
595          text: &[u8],
596          start: usize,
597      ) -> Option<(usize, usize)> {
598          let slots = locs.as_slots();
599          for slot in slots.iter_mut() {
600              *slot = None;
601          }
602          // If the caller unnecessarily uses this, then we try to save them
603          // from themselves.
604          match slots.len() {
605              0 => return self.find_at(text, start),
606              2 => {
607                  return self.find_at(text, start).map(|(s, e)| {
608                      slots[0] = Some(s);
609                      slots[1] = Some(e);
610                      (s, e)
611                  });
612              }
613              _ => {} // fallthrough
614          }
615          if !self.is_anchor_end_match(text) {
616              return None;
617          }
618          match self.ro.match_type {
619              #[cfg(feature = "perf-literal")]
620              MatchType::Literal(ty) => {
621                  self.find_literals(ty, text, start).and_then(|(s, e)| {
622                      self.captures_nfa_type(
623                          MatchNfaType::Auto,
624                          slots,
625                          text,
626                          s,
627                          e,
628                      )
629                  })
630              }
631              #[cfg(feature = "perf-dfa")]
632              MatchType::Dfa => {
633                  if self.ro.nfa.is_anchored_start {
634                      self.captures_nfa(slots, text, start)
635                  } else {
636                      match self.find_dfa_forward(text, start) {
637                          dfa::Result::Match((s, e)) => self.captures_nfa_type(
638                              MatchNfaType::Auto,
639                              slots,
640                              text,
641                              s,
642                              e,
643                          ),
644                          dfa::Result::NoMatch(_) => None,
645                          dfa::Result::Quit => {
646                              self.captures_nfa(slots, text, start)
647                          }
648                      }
649                  }
650              }
651              #[cfg(feature = "perf-dfa")]
652              MatchType::DfaAnchoredReverse => {
653                  match self.find_dfa_anchored_reverse(text, start) {
654                      dfa::Result::Match((s, e)) => self.captures_nfa_type(
655                          MatchNfaType::Auto,
656                          slots,
657                          text,
658                          s,
659                          e,
660                      ),
661                      dfa::Result::NoMatch(_) => None,
662                      dfa::Result::Quit => self.captures_nfa(slots, text, start),
663                  }
664              }
665              #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
666              MatchType::DfaSuffix => {
667                  match self.find_dfa_reverse_suffix(text, start) {
668                      dfa::Result::Match((s, e)) => self.captures_nfa_type(
669                          MatchNfaType::Auto,
670                          slots,
671                          text,
672                          s,
673                          e,
674                      ),
675                      dfa::Result::NoMatch(_) => None,
676                      dfa::Result::Quit => self.captures_nfa(slots, text, start),
677                  }
678              }
679              MatchType::Nfa(ty) => {
680                  self.captures_nfa_type(ty, slots, text, start, text.len())
681              }
682              MatchType::Nothing => None,
683              #[cfg(feature = "perf-dfa")]
684              MatchType::DfaMany => {
685                  unreachable!("BUG: RegexSet cannot be used with captures")
686              }
687          }
688      }
689  }
690  
691  impl<'c> ExecNoSync<'c> {
692      /// Finds the leftmost-first match using only literal search.
693      #[cfg(feature = "perf-literal")]
694      #[cfg_attr(feature = "perf-inline", inline(always))]
find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>695      fn find_literals(
696          &self,
697          ty: MatchLiteralType,
698          text: &[u8],
699          start: usize,
700      ) -> Option<(usize, usize)> {
701          use self::MatchLiteralType::*;
702          match ty {
703              Unanchored => {
704                  let lits = &self.ro.nfa.prefixes;
705                  lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
706              }
707              AnchoredStart => {
708                  let lits = &self.ro.nfa.prefixes;
709                  if start == 0 || !self.ro.nfa.is_anchored_start {
710                      lits.find_start(&text[start..])
711                          .map(|(s, e)| (start + s, start + e))
712                  } else {
713                      None
714                  }
715              }
716              AnchoredEnd => {
717                  let lits = &self.ro.suffixes;
718                  lits.find_end(&text[start..])
719                      .map(|(s, e)| (start + s, start + e))
720              }
721              AhoCorasick => self
722                  .ro
723                  .ac
724                  .as_ref()
725                  .unwrap()
726                  .find(&text[start..])
727                  .map(|m| (start + m.start(), start + m.end())),
728          }
729      }
730  
731      /// Finds the leftmost-first match (start and end) using only the DFA.
732      ///
733      /// If the result returned indicates that the DFA quit, then another
734      /// matching engine should be used.
735      #[cfg(feature = "perf-dfa")]
736      #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>737      fn find_dfa_forward(
738          &self,
739          text: &[u8],
740          start: usize,
741      ) -> dfa::Result<(usize, usize)> {
742          use dfa::Result::*;
743          let end = match dfa::Fsm::forward(
744              &self.ro.dfa,
745              self.cache.value(),
746              false,
747              text,
748              start,
749          ) {
750              NoMatch(i) => return NoMatch(i),
751              Quit => return Quit,
752              Match(end) if start == end => return Match((start, start)),
753              Match(end) => end,
754          };
755          // Now run the DFA in reverse to find the start of the match.
756          match dfa::Fsm::reverse(
757              &self.ro.dfa_reverse,
758              self.cache.value(),
759              false,
760              &text[start..],
761              end - start,
762          ) {
763              Match(s) => Match((start + s, end)),
764              NoMatch(i) => NoMatch(i),
765              Quit => Quit,
766          }
767      }
768  
769      /// Finds the leftmost-first match (start and end) using only the DFA,
770      /// but assumes the regex is anchored at the end and therefore starts at
771      /// the end of the regex and matches in reverse.
772      ///
773      /// If the result returned indicates that the DFA quit, then another
774      /// matching engine should be used.
775      #[cfg(feature = "perf-dfa")]
776      #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>777      fn find_dfa_anchored_reverse(
778          &self,
779          text: &[u8],
780          start: usize,
781      ) -> dfa::Result<(usize, usize)> {
782          use dfa::Result::*;
783          match dfa::Fsm::reverse(
784              &self.ro.dfa_reverse,
785              self.cache.value(),
786              false,
787              &text[start..],
788              text.len() - start,
789          ) {
790              Match(s) => Match((start + s, text.len())),
791              NoMatch(i) => NoMatch(i),
792              Quit => Quit,
793          }
794      }
795  
796      /// Finds the end of the shortest match using only the DFA.
797      #[cfg(feature = "perf-dfa")]
798      #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>799      fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
800          dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
801      }
802  
803      /// Finds the end of the shortest match using only the DFA by scanning for
804      /// suffix literals.
805      #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
806      #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>807      fn shortest_dfa_reverse_suffix(
808          &self,
809          text: &[u8],
810          start: usize,
811      ) -> dfa::Result<usize> {
812          match self.exec_dfa_reverse_suffix(text, start) {
813              None => self.shortest_dfa(text, start),
814              Some(r) => r.map(|(_, end)| end),
815          }
816      }
817  
818      /// Finds the end of the shortest match using only the DFA by scanning for
819      /// suffix literals. It also reports the start of the match.
820      ///
821      /// Note that if None is returned, then the optimization gave up to avoid
822      /// worst case quadratic behavior. A forward scanning DFA should be tried
823      /// next.
824      ///
825      /// If a match is returned and the full leftmost-first match is desired,
826      /// then a forward scan starting from the beginning of the match must be
827      /// done.
828      ///
829      /// If the result returned indicates that the DFA quit, then another
830      /// matching engine should be used.
831      #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
832      #[cfg_attr(feature = "perf-inline", inline(always))]
exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>833      fn exec_dfa_reverse_suffix(
834          &self,
835          text: &[u8],
836          original_start: usize,
837      ) -> Option<dfa::Result<(usize, usize)>> {
838          use dfa::Result::*;
839  
840          let lcs = self.ro.suffixes.lcs();
841          debug_assert!(lcs.len() >= 1);
842          let mut start = original_start;
843          let mut end = start;
844          let mut last_literal = start;
845          while end <= text.len() {
846              last_literal += match lcs.find(&text[last_literal..]) {
847                  None => return Some(NoMatch(text.len())),
848                  Some(i) => i,
849              };
850              end = last_literal + lcs.len();
851              match dfa::Fsm::reverse(
852                  &self.ro.dfa_reverse,
853                  self.cache.value(),
854                  false,
855                  &text[start..end],
856                  end - start,
857              ) {
858                  Match(0) | NoMatch(0) => return None,
859                  Match(i) => return Some(Match((start + i, end))),
860                  NoMatch(i) => {
861                      start += i;
862                      last_literal += 1;
863                      continue;
864                  }
865                  Quit => return Some(Quit),
866              };
867          }
868          Some(NoMatch(text.len()))
869      }
870  
871      /// Finds the leftmost-first match (start and end) using only the DFA
872      /// by scanning for suffix literals.
873      ///
874      /// If the result returned indicates that the DFA quit, then another
875      /// matching engine should be used.
876      #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
877      #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>878      fn find_dfa_reverse_suffix(
879          &self,
880          text: &[u8],
881          start: usize,
882      ) -> dfa::Result<(usize, usize)> {
883          use dfa::Result::*;
884  
885          let match_start = match self.exec_dfa_reverse_suffix(text, start) {
886              None => return self.find_dfa_forward(text, start),
887              Some(Match((start, _))) => start,
888              Some(r) => return r,
889          };
890          // At this point, we've found a match. The only way to quit now
891          // without a match is if the DFA gives up (seems unlikely).
892          //
893          // Now run the DFA forwards to find the proper end of the match.
894          // (The suffix literal match can only indicate the earliest
895          // possible end location, which may appear before the end of the
896          // leftmost-first match.)
897          match dfa::Fsm::forward(
898              &self.ro.dfa,
899              self.cache.value(),
900              false,
901              text,
902              match_start,
903          ) {
904              NoMatch(_) => panic!("BUG: reverse match implies forward match"),
905              Quit => Quit,
906              Match(e) => Match((match_start, e)),
907          }
908      }
909  
910      /// Executes the NFA engine to return whether there is a match or not.
911      ///
912      /// Ideally, we could use shortest_nfa(...).is_some() and get the same
913      /// performance characteristics, but regex sets don't have captures, which
914      /// shortest_nfa depends on.
915      #[cfg(feature = "perf-dfa")]
match_nfa(&self, text: &[u8], start: usize) -> bool916      fn match_nfa(&self, text: &[u8], start: usize) -> bool {
917          self.match_nfa_type(MatchNfaType::Auto, text, start)
918      }
919  
920      /// Like match_nfa, but allows specification of the type of NFA engine.
match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool921      fn match_nfa_type(
922          &self,
923          ty: MatchNfaType,
924          text: &[u8],
925          start: usize,
926      ) -> bool {
927          self.exec_nfa(
928              ty,
929              &mut [false],
930              &mut [],
931              true,
932              false,
933              text,
934              start,
935              text.len(),
936          )
937      }
938  
939      /// Finds the shortest match using an NFA.
940      #[cfg(feature = "perf-dfa")]
shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>941      fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
942          self.shortest_nfa_type(MatchNfaType::Auto, text, start)
943      }
944  
945      /// Like shortest_nfa, but allows specification of the type of NFA engine.
shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>946      fn shortest_nfa_type(
947          &self,
948          ty: MatchNfaType,
949          text: &[u8],
950          start: usize,
951      ) -> Option<usize> {
952          let mut slots = [None, None];
953          if self.exec_nfa(
954              ty,
955              &mut [false],
956              &mut slots,
957              true,
958              true,
959              text,
960              start,
961              text.len(),
962          ) {
963              slots[1]
964          } else {
965              None
966          }
967      }
968  
969      /// Like find, but executes an NFA engine.
find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>970      fn find_nfa(
971          &self,
972          ty: MatchNfaType,
973          text: &[u8],
974          start: usize,
975      ) -> Option<(usize, usize)> {
976          let mut slots = [None, None];
977          if self.exec_nfa(
978              ty,
979              &mut [false],
980              &mut slots,
981              false,
982              false,
983              text,
984              start,
985              text.len(),
986          ) {
987              match (slots[0], slots[1]) {
988                  (Some(s), Some(e)) => Some((s, e)),
989                  _ => None,
990              }
991          } else {
992              None
993          }
994      }
995  
996      /// Like find_nfa, but fills in captures.
997      ///
998      /// `slots` should have length equal to `2 * nfa.captures.len()`.
999      #[cfg(feature = "perf-dfa")]
captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>1000      fn captures_nfa(
1001          &self,
1002          slots: &mut [Slot],
1003          text: &[u8],
1004          start: usize,
1005      ) -> Option<(usize, usize)> {
1006          self.captures_nfa_type(
1007              MatchNfaType::Auto,
1008              slots,
1009              text,
1010              start,
1011              text.len(),
1012          )
1013      }
1014  
1015      /// Like captures_nfa, but allows specification of type of NFA engine.
captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> Option<(usize, usize)>1016      fn captures_nfa_type(
1017          &self,
1018          ty: MatchNfaType,
1019          slots: &mut [Slot],
1020          text: &[u8],
1021          start: usize,
1022          end: usize,
1023      ) -> Option<(usize, usize)> {
1024          if self.exec_nfa(
1025              ty,
1026              &mut [false],
1027              slots,
1028              false,
1029              false,
1030              text,
1031              start,
1032              end,
1033          ) {
1034              match (slots[0], slots[1]) {
1035                  (Some(s), Some(e)) => Some((s, e)),
1036                  _ => None,
1037              }
1038          } else {
1039              None
1040          }
1041      }
1042  
exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, quit_after_match_with_pos: bool, text: &[u8], start: usize, end: usize, ) -> bool1043      fn exec_nfa(
1044          &self,
1045          mut ty: MatchNfaType,
1046          matches: &mut [bool],
1047          slots: &mut [Slot],
1048          quit_after_match: bool,
1049          quit_after_match_with_pos: bool,
1050          text: &[u8],
1051          start: usize,
1052          end: usize,
1053      ) -> bool {
1054          use self::MatchNfaType::*;
1055          if let Auto = ty {
1056              if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1057                  ty = Backtrack;
1058              } else {
1059                  ty = PikeVM;
1060              }
1061          }
1062          // The backtracker can't return the shortest match position as it is
1063          // implemented today. So if someone calls `shortest_match` and we need
1064          // to run an NFA, then use the PikeVM.
1065          if quit_after_match_with_pos || ty == PikeVM {
1066              self.exec_pikevm(
1067                  matches,
1068                  slots,
1069                  quit_after_match,
1070                  text,
1071                  start,
1072                  end,
1073              )
1074          } else {
1075              self.exec_backtrack(matches, slots, text, start, end)
1076          }
1077      }
1078  
1079      /// Always run the NFA algorithm.
exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool1080      fn exec_pikevm(
1081          &self,
1082          matches: &mut [bool],
1083          slots: &mut [Slot],
1084          quit_after_match: bool,
1085          text: &[u8],
1086          start: usize,
1087          end: usize,
1088      ) -> bool {
1089          if self.ro.nfa.uses_bytes() {
1090              pikevm::Fsm::exec(
1091                  &self.ro.nfa,
1092                  self.cache.value(),
1093                  matches,
1094                  slots,
1095                  quit_after_match,
1096                  ByteInput::new(text, self.ro.nfa.only_utf8),
1097                  start,
1098                  end,
1099              )
1100          } else {
1101              pikevm::Fsm::exec(
1102                  &self.ro.nfa,
1103                  self.cache.value(),
1104                  matches,
1105                  slots,
1106                  quit_after_match,
1107                  CharInput::new(text),
1108                  start,
1109                  end,
1110              )
1111          }
1112      }
1113  
1114      /// Always runs the NFA using bounded backtracking.
exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> bool1115      fn exec_backtrack(
1116          &self,
1117          matches: &mut [bool],
1118          slots: &mut [Slot],
1119          text: &[u8],
1120          start: usize,
1121          end: usize,
1122      ) -> bool {
1123          if self.ro.nfa.uses_bytes() {
1124              backtrack::Bounded::exec(
1125                  &self.ro.nfa,
1126                  self.cache.value(),
1127                  matches,
1128                  slots,
1129                  ByteInput::new(text, self.ro.nfa.only_utf8),
1130                  start,
1131                  end,
1132              )
1133          } else {
1134              backtrack::Bounded::exec(
1135                  &self.ro.nfa,
1136                  self.cache.value(),
1137                  matches,
1138                  slots,
1139                  CharInput::new(text),
1140                  start,
1141                  end,
1142              )
1143          }
1144      }
1145  
1146      /// Finds which regular expressions match the given text.
1147      ///
1148      /// `matches` should have length equal to the number of regexes being
1149      /// searched.
1150      ///
1151      /// This is only useful when one wants to know which regexes in a set
1152      /// match some text.
many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1153      pub fn many_matches_at(
1154          &self,
1155          matches: &mut [bool],
1156          text: &[u8],
1157          start: usize,
1158      ) -> bool {
1159          use self::MatchType::*;
1160          if !self.is_anchor_end_match(text) {
1161              return false;
1162          }
1163          match self.ro.match_type {
1164              #[cfg(feature = "perf-literal")]
1165              Literal(ty) => {
1166                  debug_assert_eq!(matches.len(), 1);
1167                  matches[0] = self.find_literals(ty, text, start).is_some();
1168                  matches[0]
1169              }
1170              #[cfg(feature = "perf-dfa")]
1171              Dfa | DfaAnchoredReverse | DfaMany => {
1172                  match dfa::Fsm::forward_many(
1173                      &self.ro.dfa,
1174                      self.cache.value(),
1175                      matches,
1176                      text,
1177                      start,
1178                  ) {
1179                      dfa::Result::Match(_) => true,
1180                      dfa::Result::NoMatch(_) => false,
1181                      dfa::Result::Quit => self.exec_nfa(
1182                          MatchNfaType::Auto,
1183                          matches,
1184                          &mut [],
1185                          false,
1186                          false,
1187                          text,
1188                          start,
1189                          text.len(),
1190                      ),
1191                  }
1192              }
1193              #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1194              DfaSuffix => {
1195                  match dfa::Fsm::forward_many(
1196                      &self.ro.dfa,
1197                      self.cache.value(),
1198                      matches,
1199                      text,
1200                      start,
1201                  ) {
1202                      dfa::Result::Match(_) => true,
1203                      dfa::Result::NoMatch(_) => false,
1204                      dfa::Result::Quit => self.exec_nfa(
1205                          MatchNfaType::Auto,
1206                          matches,
1207                          &mut [],
1208                          false,
1209                          false,
1210                          text,
1211                          start,
1212                          text.len(),
1213                      ),
1214                  }
1215              }
1216              Nfa(ty) => self.exec_nfa(
1217                  ty,
1218                  matches,
1219                  &mut [],
1220                  false,
1221                  false,
1222                  text,
1223                  start,
1224                  text.len(),
1225              ),
1226              Nothing => false,
1227          }
1228      }
1229  
1230      #[cfg_attr(feature = "perf-inline", inline(always))]
is_anchor_end_match(&self, text: &[u8]) -> bool1231      fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1232          #[cfg(not(feature = "perf-literal"))]
1233          fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1234              true
1235          }
1236  
1237          #[cfg(feature = "perf-literal")]
1238          fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1239              // Only do this check if the haystack is big (>1MB).
1240              if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
1241                  let lcs = ro.suffixes.lcs();
1242                  if lcs.len() >= 1 && !lcs.is_suffix(text) {
1243                      return false;
1244                  }
1245              }
1246              true
1247          }
1248  
1249          imp(&self.ro, text)
1250      }
1251  
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1252      pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1253          &self.ro.nfa.capture_name_idx
1254      }
1255  }
1256  
1257  impl<'c> ExecNoSyncStr<'c> {
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1258      pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1259          self.0.capture_name_idx()
1260      }
1261  }
1262  
1263  impl Exec {
1264      /// Get a searcher that isn't Sync.
1265      #[cfg_attr(feature = "perf-inline", inline(always))]
searcher(&self) -> ExecNoSync1266      pub fn searcher(&self) -> ExecNoSync {
1267          ExecNoSync {
1268              ro: &self.ro, // a clone is too expensive here! (and not needed)
1269              cache: self.pool.get(),
1270          }
1271      }
1272  
1273      /// Get a searcher that isn't Sync and can match on &str.
1274      #[cfg_attr(feature = "perf-inline", inline(always))]
searcher_str(&self) -> ExecNoSyncStr1275      pub fn searcher_str(&self) -> ExecNoSyncStr {
1276          ExecNoSyncStr(self.searcher())
1277      }
1278  
1279      /// Build a Regex from this executor.
into_regex(self) -> re_unicode::Regex1280      pub fn into_regex(self) -> re_unicode::Regex {
1281          re_unicode::Regex::from(self)
1282      }
1283  
1284      /// Build a RegexSet from this executor.
into_regex_set(self) -> re_set::unicode::RegexSet1285      pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1286          re_set::unicode::RegexSet::from(self)
1287      }
1288  
1289      /// Build a Regex from this executor that can match arbitrary bytes.
into_byte_regex(self) -> re_bytes::Regex1290      pub fn into_byte_regex(self) -> re_bytes::Regex {
1291          re_bytes::Regex::from(self)
1292      }
1293  
1294      /// Build a RegexSet from this executor that can match arbitrary bytes.
into_byte_regex_set(self) -> re_set::bytes::RegexSet1295      pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1296          re_set::bytes::RegexSet::from(self)
1297      }
1298  
1299      /// The original regular expressions given by the caller that were
1300      /// compiled.
regex_strings(&self) -> &[String]1301      pub fn regex_strings(&self) -> &[String] {
1302          &self.ro.res
1303      }
1304  
1305      /// Return a slice of capture names.
1306      ///
1307      /// Any capture that isn't named is None.
capture_names(&self) -> &[Option<String>]1308      pub fn capture_names(&self) -> &[Option<String>] {
1309          &self.ro.nfa.captures
1310      }
1311  
1312      /// Return a reference to named groups mapping (from group name to
1313      /// group position).
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1314      pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1315          &self.ro.nfa.capture_name_idx
1316      }
1317  }
1318  
1319  impl Clone for Exec {
clone(&self) -> Exec1320      fn clone(&self) -> Exec {
1321          let pool = ExecReadOnly::new_pool(&self.ro);
1322          Exec { ro: self.ro.clone(), pool }
1323      }
1324  }
1325  
1326  impl ExecReadOnly {
choose_match_type(&self, hint: Option<MatchType>) -> MatchType1327      fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1328          if let Some(MatchType::Nfa(_)) = hint {
1329              return hint.unwrap();
1330          }
1331          // If the NFA is empty, then we'll never match anything.
1332          if self.nfa.insts.is_empty() {
1333              return MatchType::Nothing;
1334          }
1335          if let Some(literalty) = self.choose_literal_match_type() {
1336              return literalty;
1337          }
1338          if let Some(dfaty) = self.choose_dfa_match_type() {
1339              return dfaty;
1340          }
1341          // We're so totally hosed.
1342          MatchType::Nfa(MatchNfaType::Auto)
1343      }
1344  
1345      /// If a plain literal scan can be used, then a corresponding literal
1346      /// search type is returned.
choose_literal_match_type(&self) -> Option<MatchType>1347      fn choose_literal_match_type(&self) -> Option<MatchType> {
1348          #[cfg(not(feature = "perf-literal"))]
1349          fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1350              None
1351          }
1352  
1353          #[cfg(feature = "perf-literal")]
1354          fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1355              // If our set of prefixes is complete, then we can use it to find
1356              // a match in lieu of a regex engine. This doesn't quite work well
1357              // in the presence of multiple regexes, so only do it when there's
1358              // one.
1359              //
1360              // TODO(burntsushi): Also, don't try to match literals if the regex
1361              // is partially anchored. We could technically do it, but we'd need
1362              // to create two sets of literals: all of them and then the subset
1363              // that aren't anchored. We would then only search for all of them
1364              // when at the beginning of the input and use the subset in all
1365              // other cases.
1366              if ro.res.len() != 1 {
1367                  return None;
1368              }
1369              if ro.ac.is_some() {
1370                  return Some(MatchType::Literal(
1371                      MatchLiteralType::AhoCorasick,
1372                  ));
1373              }
1374              if ro.nfa.prefixes.complete() {
1375                  return if ro.nfa.is_anchored_start {
1376                      Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1377                  } else {
1378                      Some(MatchType::Literal(MatchLiteralType::Unanchored))
1379                  };
1380              }
1381              if ro.suffixes.complete() {
1382                  return if ro.nfa.is_anchored_end {
1383                      Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1384                  } else {
1385                      // This case shouldn't happen. When the regex isn't
1386                      // anchored, then complete prefixes should imply complete
1387                      // suffixes.
1388                      Some(MatchType::Literal(MatchLiteralType::Unanchored))
1389                  };
1390              }
1391              None
1392          }
1393  
1394          imp(self)
1395      }
1396  
1397      /// If a DFA scan can be used, then choose the appropriate DFA strategy.
choose_dfa_match_type(&self) -> Option<MatchType>1398      fn choose_dfa_match_type(&self) -> Option<MatchType> {
1399          #[cfg(not(feature = "perf-dfa"))]
1400          fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1401              None
1402          }
1403  
1404          #[cfg(feature = "perf-dfa")]
1405          fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1406              if !dfa::can_exec(&ro.dfa) {
1407                  return None;
1408              }
1409              // Regex sets require a slightly specialized path.
1410              if ro.res.len() >= 2 {
1411                  return Some(MatchType::DfaMany);
1412              }
1413              // If the regex is anchored at the end but not the start, then
1414              // just match in reverse from the end of the haystack.
1415              if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1416                  return Some(MatchType::DfaAnchoredReverse);
1417              }
1418              #[cfg(feature = "perf-literal")]
1419              {
1420                  // If there's a longish suffix literal, then it might be faster
1421                  // to look for that first.
1422                  if ro.should_suffix_scan() {
1423                      return Some(MatchType::DfaSuffix);
1424                  }
1425              }
1426              // Fall back to your garden variety forward searching lazy DFA.
1427              Some(MatchType::Dfa)
1428          }
1429  
1430          imp(self)
1431      }
1432  
1433      /// Returns true if the program is amenable to suffix scanning.
1434      ///
1435      /// When this is true, as a heuristic, we assume it is OK to quickly scan
1436      /// for suffix literals and then do a *reverse* DFA match from any matches
1437      /// produced by the literal scan. (And then followed by a forward DFA
1438      /// search, since the previously found suffix literal maybe not actually be
1439      /// the end of a match.)
1440      ///
1441      /// This is a bit of a specialized optimization, but can result in pretty
1442      /// big performance wins if 1) there are no prefix literals and 2) the
1443      /// suffix literals are pretty rare in the text. (1) is obviously easy to
1444      /// account for but (2) is harder. As a proxy, we assume that longer
1445      /// strings are generally rarer, so we only enable this optimization when
1446      /// we have a meaty suffix.
1447      #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
should_suffix_scan(&self) -> bool1448      fn should_suffix_scan(&self) -> bool {
1449          if self.suffixes.is_empty() {
1450              return false;
1451          }
1452          let lcs_len = self.suffixes.lcs().char_len();
1453          lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1454      }
1455  
new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>>1456      fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
1457          let ro = ro.clone();
1458          Box::new(Pool::new(Box::new(move || {
1459              AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
1460          })))
1461      }
1462  }
1463  
1464  #[derive(Clone, Copy, Debug)]
1465  enum MatchType {
1466      /// A single or multiple literal search. This is only used when the regex
1467      /// can be decomposed into a literal search.
1468      #[cfg(feature = "perf-literal")]
1469      Literal(MatchLiteralType),
1470      /// A normal DFA search.
1471      #[cfg(feature = "perf-dfa")]
1472      Dfa,
1473      /// A reverse DFA search starting from the end of a haystack.
1474      #[cfg(feature = "perf-dfa")]
1475      DfaAnchoredReverse,
1476      /// A reverse DFA search with suffix literal scanning.
1477      #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1478      DfaSuffix,
1479      /// Use the DFA on two or more regular expressions.
1480      #[cfg(feature = "perf-dfa")]
1481      DfaMany,
1482      /// An NFA variant.
1483      Nfa(MatchNfaType),
1484      /// No match is ever possible, so don't ever try to search.
1485      Nothing,
1486  }
1487  
1488  #[derive(Clone, Copy, Debug)]
1489  #[cfg(feature = "perf-literal")]
1490  enum MatchLiteralType {
1491      /// Match literals anywhere in text.
1492      Unanchored,
1493      /// Match literals only at the start of text.
1494      AnchoredStart,
1495      /// Match literals only at the end of text.
1496      AnchoredEnd,
1497      /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1498      /// ExecReadOnly.
1499      AhoCorasick,
1500  }
1501  
1502  #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1503  enum MatchNfaType {
1504      /// Choose between Backtrack and PikeVM.
1505      Auto,
1506      /// NFA bounded backtracking.
1507      ///
1508      /// (This is only set by tests, since it never makes sense to always want
1509      /// backtracking.)
1510      Backtrack,
1511      /// The Pike VM.
1512      ///
1513      /// (This is only set by tests, since it never makes sense to always want
1514      /// the Pike VM.)
1515      PikeVM,
1516  }
1517  
1518  /// `ProgramCache` maintains reusable allocations for each matching engine
1519  /// available to a particular program.
1520  ///
1521  /// We declare this as unwind safe since it's a cache that's only used for
1522  /// performance purposes. If a panic occurs, it is (or should be) always safe
1523  /// to continue using the same regex object.
1524  pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
1525  
1526  #[derive(Debug)]
1527  pub struct ProgramCacheInner {
1528      pub pikevm: pikevm::Cache,
1529      pub backtrack: backtrack::Cache,
1530      #[cfg(feature = "perf-dfa")]
1531      pub dfa: dfa::Cache,
1532      #[cfg(feature = "perf-dfa")]
1533      pub dfa_reverse: dfa::Cache,
1534  }
1535  
1536  impl ProgramCacheInner {
new(ro: &ExecReadOnly) -> Self1537      fn new(ro: &ExecReadOnly) -> Self {
1538          ProgramCacheInner {
1539              pikevm: pikevm::Cache::new(&ro.nfa),
1540              backtrack: backtrack::Cache::new(&ro.nfa),
1541              #[cfg(feature = "perf-dfa")]
1542              dfa: dfa::Cache::new(&ro.dfa),
1543              #[cfg(feature = "perf-dfa")]
1544              dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1545          }
1546      }
1547  }
1548  
1549  /// Alternation literals checks if the given HIR is a simple alternation of
1550  /// literals, and if so, returns them. Otherwise, this returns None.
1551  #[cfg(feature = "perf-literal")]
alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>>1552  fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1553      use syntax::hir::{HirKind, Literal};
1554  
1555      // This is pretty hacky, but basically, if `is_alternation_literal` is
1556      // true, then we can make several assumptions about the structure of our
1557      // HIR. This is what justifies the `unreachable!` statements below.
1558      //
1559      // This code should be refactored once we overhaul this crate's
1560      // optimization pipeline, because this is a terribly inflexible way to go
1561      // about things.
1562  
1563      if !expr.is_alternation_literal() {
1564          return None;
1565      }
1566      let alts = match *expr.kind() {
1567          HirKind::Alternation(ref alts) => alts,
1568          _ => return None, // one literal isn't worth it
1569      };
1570  
1571      let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
1572          Literal::Unicode(c) => {
1573              let mut buf = [0; 4];
1574              dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1575          }
1576          Literal::Byte(b) => {
1577              dst.push(b);
1578          }
1579      };
1580  
1581      let mut lits = vec![];
1582      for alt in alts {
1583          let mut lit = vec![];
1584          match *alt.kind() {
1585              HirKind::Literal(ref x) => extendlit(x, &mut lit),
1586              HirKind::Concat(ref exprs) => {
1587                  for e in exprs {
1588                      match *e.kind() {
1589                          HirKind::Literal(ref x) => extendlit(x, &mut lit),
1590                          _ => unreachable!("expected literal, got {:?}", e),
1591                      }
1592                  }
1593              }
1594              _ => unreachable!("expected literal or concat, got {:?}", alt),
1595          }
1596          lits.push(lit);
1597      }
1598      Some(lits)
1599  }
1600  
1601  #[cfg(test)]
1602  mod test {
1603      #[test]
uppercut_s_backtracking_bytes_default_bytes_mismatch()1604      fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1605          use internal::ExecBuilder;
1606  
1607          let backtrack_bytes_re = ExecBuilder::new("^S")
1608              .bounded_backtracking()
1609              .only_utf8(false)
1610              .build()
1611              .map(|exec| exec.into_byte_regex())
1612              .map_err(|err| format!("{}", err))
1613              .unwrap();
1614  
1615          let default_bytes_re = ExecBuilder::new("^S")
1616              .only_utf8(false)
1617              .build()
1618              .map(|exec| exec.into_byte_regex())
1619              .map_err(|err| format!("{}", err))
1620              .unwrap();
1621  
1622          let input = vec![83, 83];
1623  
1624          let s1 = backtrack_bytes_re.split(&input);
1625          let s2 = default_bytes_re.split(&input);
1626          for (chunk1, chunk2) in s1.zip(s2) {
1627              assert_eq!(chunk1, chunk2);
1628          }
1629      }
1630  
1631      #[test]
unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1632      fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1633          use internal::ExecBuilder;
1634  
1635          let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1636              .bounded_backtracking()
1637              .bytes(true)
1638              .build()
1639              .map(|exec| exec.into_regex())
1640              .map_err(|err| format!("{}", err))
1641              .unwrap();
1642  
1643          let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1644              .bytes(true)
1645              .build()
1646              .map(|exec| exec.into_regex())
1647              .map_err(|err| format!("{}", err))
1648              .unwrap();
1649  
1650          let input = "**";
1651  
1652          let s1 = backtrack_bytes_re.split(input);
1653          let s2 = default_bytes_re.split(input);
1654          for (chunk1, chunk2) in s1.zip(s2) {
1655              assert_eq!(chunk1, chunk2);
1656          }
1657      }
1658  }
1659