1 use std::cell::RefCell; 2 use std::collections::HashMap; 3 use std::panic::AssertUnwindSafe; 4 use std::sync::Arc; 5 6 #[cfg(feature = "perf-literal")] 7 use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; 8 use syntax::hir::literal::Literals; 9 use syntax::hir::Hir; 10 use syntax::ParserBuilder; 11 12 use backtrack; 13 use compile::Compiler; 14 #[cfg(feature = "perf-dfa")] 15 use dfa; 16 use error::Error; 17 use input::{ByteInput, CharInput}; 18 use literal::LiteralSearcher; 19 use pikevm; 20 use pool::{Pool, PoolGuard}; 21 use prog::Program; 22 use re_builder::RegexOptions; 23 use re_bytes; 24 use re_set; 25 use re_trait::{Locations, RegularExpression, Slot}; 26 use re_unicode; 27 use utf8::next_utf8; 28 29 /// `Exec` manages the execution of a regular expression. 30 /// 31 /// In particular, this manages the various compiled forms of a single regular 32 /// expression and the choice of which matching engine to use to execute a 33 /// regular expression. 34 #[derive(Debug)] 35 pub struct Exec { 36 /// All read only state. 37 ro: Arc<ExecReadOnly>, 38 /// A pool of reusable values for the various matching engines. 39 /// 40 /// Note that boxing this value is not strictly necessary, but it is an 41 /// easy way to ensure that T does not bloat the stack sized used by a pool 42 /// in the case where T is big. And this turns out to be the case at the 43 /// time of writing for regex's use of this pool. At the time of writing, 44 /// the size of a Regex on the stack is 856 bytes. Boxing this value 45 /// reduces that size to 16 bytes. 46 pool: Box<Pool<ProgramCache>>, 47 } 48 49 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This 50 /// means it is no longer Sync, but we can now avoid the overhead of 51 /// synchronization to fetch the cache. 52 #[derive(Debug)] 53 pub struct ExecNoSync<'c> { 54 /// All read only state. 55 ro: &'c Arc<ExecReadOnly>, 56 /// Caches for the various matching engines. 57 cache: PoolGuard<'c, ProgramCache>, 58 } 59 60 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8]. 61 #[derive(Debug)] 62 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); 63 64 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such 65 /// state is determined at compile time and never changes during search. 66 #[derive(Debug)] 67 struct ExecReadOnly { 68 /// The original regular expressions given by the caller to compile. 69 res: Vec<String>, 70 /// A compiled program that is used in the NFA simulation and backtracking. 71 /// It can be byte-based or Unicode codepoint based. 72 /// 73 /// N.B. It is not possibly to make this byte-based from the public API. 74 /// It is only used for testing byte based programs in the NFA simulations. 75 nfa: Program, 76 /// A compiled byte based program for DFA execution. This is only used 77 /// if a DFA can be executed. (Currently, only word boundary assertions are 78 /// not supported.) Note that this program contains an embedded `.*?` 79 /// preceding the first capture group, unless the regex is anchored at the 80 /// beginning. 81 dfa: Program, 82 /// The same as above, except the program is reversed (and there is no 83 /// preceding `.*?`). This is used by the DFA to find the starting location 84 /// of matches. 85 dfa_reverse: Program, 86 /// A set of suffix literals extracted from the regex. 87 /// 88 /// Prefix literals are stored on the `Program`, since they are used inside 89 /// the matching engines. 90 suffixes: LiteralSearcher, 91 /// An Aho-Corasick automaton with leftmost-first match semantics. 92 /// 93 /// This is only set when the entire regex is a simple unanchored 94 /// alternation of literals. We could probably use it more circumstances, 95 /// but this is already hacky enough in this architecture. 96 /// 97 /// N.B. We use u32 as a state ID representation under the assumption that 98 /// if we were to exhaust the ID space, we probably would have long 99 /// surpassed the compilation size limit. 100 #[cfg(feature = "perf-literal")] 101 ac: Option<AhoCorasick<u32>>, 102 /// match_type encodes as much upfront knowledge about how we're going to 103 /// execute a search as possible. 104 match_type: MatchType, 105 } 106 107 /// Facilitates the construction of an executor by exposing various knobs 108 /// to control how a regex is executed and what kinds of resources it's 109 /// permitted to use. 110 // `ExecBuilder` is only public via the `internal` module, so avoid deriving 111 // `Debug`. 112 #[allow(missing_debug_implementations)] 113 pub struct ExecBuilder { 114 options: RegexOptions, 115 match_type: Option<MatchType>, 116 bytes: bool, 117 only_utf8: bool, 118 } 119 120 /// Parsed represents a set of parsed regular expressions and their detected 121 /// literals. 122 struct Parsed { 123 exprs: Vec<Hir>, 124 prefixes: Literals, 125 suffixes: Literals, 126 bytes: bool, 127 } 128 129 impl ExecBuilder { 130 /// Create a regex execution builder. 131 /// 132 /// This uses default settings for everything except the regex itself, 133 /// which must be provided. Further knobs can be set by calling methods, 134 /// and then finally, `build` to actually create the executor. new(re: &str) -> Self135 pub fn new(re: &str) -> Self { 136 Self::new_many(&[re]) 137 } 138 139 /// Like new, but compiles the union of the given regular expressions. 140 /// 141 /// Note that when compiling 2 or more regular expressions, capture groups 142 /// are completely unsupported. (This means both `find` and `captures` 143 /// wont work.) new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item = S>,144 pub fn new_many<I, S>(res: I) -> Self 145 where 146 S: AsRef<str>, 147 I: IntoIterator<Item = S>, 148 { 149 let mut opts = RegexOptions::default(); 150 opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect(); 151 Self::new_options(opts) 152 } 153 154 /// Create a regex execution builder. new_options(opts: RegexOptions) -> Self155 pub fn new_options(opts: RegexOptions) -> Self { 156 ExecBuilder { 157 options: opts, 158 match_type: None, 159 bytes: false, 160 only_utf8: true, 161 } 162 } 163 164 /// Set the matching engine to be automatically determined. 165 /// 166 /// This is the default state and will apply whatever optimizations are 167 /// possible, such as running a DFA. 168 /// 169 /// This overrides whatever was previously set via the `nfa` or 170 /// `bounded_backtracking` methods. automatic(mut self) -> Self171 pub fn automatic(mut self) -> Self { 172 self.match_type = None; 173 self 174 } 175 176 /// Sets the matching engine to use the NFA algorithm no matter what 177 /// optimizations are possible. 178 /// 179 /// This overrides whatever was previously set via the `automatic` or 180 /// `bounded_backtracking` methods. nfa(mut self) -> Self181 pub fn nfa(mut self) -> Self { 182 self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM)); 183 self 184 } 185 186 /// Sets the matching engine to use a bounded backtracking engine no 187 /// matter what optimizations are possible. 188 /// 189 /// One must use this with care, since the bounded backtracking engine 190 /// uses memory proportion to `len(regex) * len(text)`. 191 /// 192 /// This overrides whatever was previously set via the `automatic` or 193 /// `nfa` methods. bounded_backtracking(mut self) -> Self194 pub fn bounded_backtracking(mut self) -> Self { 195 self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack)); 196 self 197 } 198 199 /// Compiles byte based programs for use with the NFA matching engines. 200 /// 201 /// By default, the NFA engines match on Unicode scalar values. They can 202 /// be made to use byte based programs instead. In general, the byte based 203 /// programs are slower because of a less efficient encoding of character 204 /// classes. 205 /// 206 /// Note that this does not impact DFA matching engines, which always 207 /// execute on bytes. bytes(mut self, yes: bool) -> Self208 pub fn bytes(mut self, yes: bool) -> Self { 209 self.bytes = yes; 210 self 211 } 212 213 /// When disabled, the program compiled may match arbitrary bytes. 214 /// 215 /// When enabled (the default), all compiled programs exclusively match 216 /// valid UTF-8 bytes. only_utf8(mut self, yes: bool) -> Self217 pub fn only_utf8(mut self, yes: bool) -> Self { 218 self.only_utf8 = yes; 219 self 220 } 221 222 /// Set the Unicode flag. unicode(mut self, yes: bool) -> Self223 pub fn unicode(mut self, yes: bool) -> Self { 224 self.options.unicode = yes; 225 self 226 } 227 228 /// Parse the current set of patterns into their AST and extract literals. parse(&self) -> Result<Parsed, Error>229 fn parse(&self) -> Result<Parsed, Error> { 230 let mut exprs = Vec::with_capacity(self.options.pats.len()); 231 let mut prefixes = Some(Literals::empty()); 232 let mut suffixes = Some(Literals::empty()); 233 let mut bytes = false; 234 let is_set = self.options.pats.len() > 1; 235 // If we're compiling a regex set and that set has any anchored 236 // expressions, then disable all literal optimizations. 237 for pat in &self.options.pats { 238 let mut parser = ParserBuilder::new() 239 .octal(self.options.octal) 240 .case_insensitive(self.options.case_insensitive) 241 .multi_line(self.options.multi_line) 242 .dot_matches_new_line(self.options.dot_matches_new_line) 243 .swap_greed(self.options.swap_greed) 244 .ignore_whitespace(self.options.ignore_whitespace) 245 .unicode(self.options.unicode) 246 .allow_invalid_utf8(!self.only_utf8) 247 .nest_limit(self.options.nest_limit) 248 .build(); 249 let expr = 250 parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; 251 bytes = bytes || !expr.is_always_utf8(); 252 253 if cfg!(feature = "perf-literal") { 254 if !expr.is_anchored_start() && expr.is_any_anchored_start() { 255 // Partial anchors unfortunately make it hard to use 256 // prefixes, so disable them. 257 prefixes = None; 258 } else if is_set && expr.is_anchored_start() { 259 // Regex sets with anchors do not go well with literal 260 // optimizations. 261 prefixes = None; 262 } 263 prefixes = prefixes.and_then(|mut prefixes| { 264 if !prefixes.union_prefixes(&expr) { 265 None 266 } else { 267 Some(prefixes) 268 } 269 }); 270 271 if !expr.is_anchored_end() && expr.is_any_anchored_end() { 272 // Partial anchors unfortunately make it hard to use 273 // suffixes, so disable them. 274 suffixes = None; 275 } else if is_set && expr.is_anchored_end() { 276 // Regex sets with anchors do not go well with literal 277 // optimizations. 278 suffixes = None; 279 } 280 suffixes = suffixes.and_then(|mut suffixes| { 281 if !suffixes.union_suffixes(&expr) { 282 None 283 } else { 284 Some(suffixes) 285 } 286 }); 287 } 288 exprs.push(expr); 289 } 290 Ok(Parsed { 291 exprs: exprs, 292 prefixes: prefixes.unwrap_or_else(Literals::empty), 293 suffixes: suffixes.unwrap_or_else(Literals::empty), 294 bytes: bytes, 295 }) 296 } 297 298 /// Build an executor that can run a regular expression. build(self) -> Result<Exec, Error>299 pub fn build(self) -> Result<Exec, Error> { 300 // Special case when we have no patterns to compile. 301 // This can happen when compiling a regex set. 302 if self.options.pats.is_empty() { 303 let ro = Arc::new(ExecReadOnly { 304 res: vec![], 305 nfa: Program::new(), 306 dfa: Program::new(), 307 dfa_reverse: Program::new(), 308 suffixes: LiteralSearcher::empty(), 309 #[cfg(feature = "perf-literal")] 310 ac: None, 311 match_type: MatchType::Nothing, 312 }); 313 let pool = ExecReadOnly::new_pool(&ro); 314 return Ok(Exec { ro: ro, pool }); 315 } 316 let parsed = self.parse()?; 317 let mut nfa = Compiler::new() 318 .size_limit(self.options.size_limit) 319 .bytes(self.bytes || parsed.bytes) 320 .only_utf8(self.only_utf8) 321 .compile(&parsed.exprs)?; 322 let mut dfa = Compiler::new() 323 .size_limit(self.options.size_limit) 324 .dfa(true) 325 .only_utf8(self.only_utf8) 326 .compile(&parsed.exprs)?; 327 let mut dfa_reverse = Compiler::new() 328 .size_limit(self.options.size_limit) 329 .dfa(true) 330 .only_utf8(self.only_utf8) 331 .reverse(true) 332 .compile(&parsed.exprs)?; 333 334 #[cfg(feature = "perf-literal")] 335 let ac = self.build_aho_corasick(&parsed); 336 nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes); 337 dfa.prefixes = nfa.prefixes.clone(); 338 dfa.dfa_size_limit = self.options.dfa_size_limit; 339 dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; 340 341 let mut ro = ExecReadOnly { 342 res: self.options.pats, 343 nfa: nfa, 344 dfa: dfa, 345 dfa_reverse: dfa_reverse, 346 suffixes: LiteralSearcher::suffixes(parsed.suffixes), 347 #[cfg(feature = "perf-literal")] 348 ac: ac, 349 match_type: MatchType::Nothing, 350 }; 351 ro.match_type = ro.choose_match_type(self.match_type); 352 353 let ro = Arc::new(ro); 354 let pool = ExecReadOnly::new_pool(&ro); 355 Ok(Exec { ro, pool }) 356 } 357 358 #[cfg(feature = "perf-literal")] build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>>359 fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> { 360 if parsed.exprs.len() != 1 { 361 return None; 362 } 363 let lits = match alternation_literals(&parsed.exprs[0]) { 364 None => return None, 365 Some(lits) => lits, 366 }; 367 // If we have a small number of literals, then let Teddy handle 368 // things (see literal/mod.rs). 369 if lits.len() <= 32 { 370 return None; 371 } 372 Some( 373 AhoCorasickBuilder::new() 374 .match_kind(MatchKind::LeftmostFirst) 375 .auto_configure(&lits) 376 // We always want this to reduce size, regardless 377 // of what auto-configure does. 378 .byte_classes(true) 379 .build_with_size::<u32, _, _>(&lits) 380 // This should never happen because we'd long exceed the 381 // compilation limit for regexes first. 382 .expect("AC automaton too big"), 383 ) 384 } 385 } 386 387 impl<'c> RegularExpression for ExecNoSyncStr<'c> { 388 type Text = str; 389 slots_len(&self) -> usize390 fn slots_len(&self) -> usize { 391 self.0.slots_len() 392 } 393 next_after_empty(&self, text: &str, i: usize) -> usize394 fn next_after_empty(&self, text: &str, i: usize) -> usize { 395 next_utf8(text.as_bytes(), i) 396 } 397 398 #[cfg_attr(feature = "perf-inline", inline(always))] shortest_match_at(&self, text: &str, start: usize) -> Option<usize>399 fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> { 400 self.0.shortest_match_at(text.as_bytes(), start) 401 } 402 403 #[cfg_attr(feature = "perf-inline", inline(always))] is_match_at(&self, text: &str, start: usize) -> bool404 fn is_match_at(&self, text: &str, start: usize) -> bool { 405 self.0.is_match_at(text.as_bytes(), start) 406 } 407 408 #[cfg_attr(feature = "perf-inline", inline(always))] find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>409 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { 410 self.0.find_at(text.as_bytes(), start) 411 } 412 413 #[cfg_attr(feature = "perf-inline", inline(always))] captures_read_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>414 fn captures_read_at( 415 &self, 416 locs: &mut Locations, 417 text: &str, 418 start: usize, 419 ) -> Option<(usize, usize)> { 420 self.0.captures_read_at(locs, text.as_bytes(), start) 421 } 422 } 423 424 impl<'c> RegularExpression for ExecNoSync<'c> { 425 type Text = [u8]; 426 427 /// Returns the number of capture slots in the regular expression. (There 428 /// are two slots for every capture group, corresponding to possibly empty 429 /// start and end locations of the capture.) slots_len(&self) -> usize430 fn slots_len(&self) -> usize { 431 self.ro.nfa.captures.len() * 2 432 } 433 next_after_empty(&self, _text: &[u8], i: usize) -> usize434 fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { 435 i + 1 436 } 437 438 /// Returns the end of a match location, possibly occurring before the 439 /// end location of the correct leftmost-first match. 440 #[cfg_attr(feature = "perf-inline", inline(always))] shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>441 fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> { 442 if !self.is_anchor_end_match(text) { 443 return None; 444 } 445 match self.ro.match_type { 446 #[cfg(feature = "perf-literal")] 447 MatchType::Literal(ty) => { 448 self.find_literals(ty, text, start).map(|(_, e)| e) 449 } 450 #[cfg(feature = "perf-dfa")] 451 MatchType::Dfa | MatchType::DfaMany => { 452 match self.shortest_dfa(text, start) { 453 dfa::Result::Match(end) => Some(end), 454 dfa::Result::NoMatch(_) => None, 455 dfa::Result::Quit => self.shortest_nfa(text, start), 456 } 457 } 458 #[cfg(feature = "perf-dfa")] 459 MatchType::DfaAnchoredReverse => { 460 match dfa::Fsm::reverse( 461 &self.ro.dfa_reverse, 462 self.cache.value(), 463 true, 464 &text[start..], 465 text.len(), 466 ) { 467 dfa::Result::Match(_) => Some(text.len()), 468 dfa::Result::NoMatch(_) => None, 469 dfa::Result::Quit => self.shortest_nfa(text, start), 470 } 471 } 472 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 473 MatchType::DfaSuffix => { 474 match self.shortest_dfa_reverse_suffix(text, start) { 475 dfa::Result::Match(e) => Some(e), 476 dfa::Result::NoMatch(_) => None, 477 dfa::Result::Quit => self.shortest_nfa(text, start), 478 } 479 } 480 MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), 481 MatchType::Nothing => None, 482 } 483 } 484 485 /// Returns true if and only if the regex matches text. 486 /// 487 /// For single regular expressions, this is equivalent to calling 488 /// shortest_match(...).is_some(). 489 #[cfg_attr(feature = "perf-inline", inline(always))] is_match_at(&self, text: &[u8], start: usize) -> bool490 fn is_match_at(&self, text: &[u8], start: usize) -> bool { 491 if !self.is_anchor_end_match(text) { 492 return false; 493 } 494 // We need to do this dance because shortest_match relies on the NFA 495 // filling in captures[1], but a RegexSet has no captures. In other 496 // words, a RegexSet can't (currently) use shortest_match. ---AG 497 match self.ro.match_type { 498 #[cfg(feature = "perf-literal")] 499 MatchType::Literal(ty) => { 500 self.find_literals(ty, text, start).is_some() 501 } 502 #[cfg(feature = "perf-dfa")] 503 MatchType::Dfa | MatchType::DfaMany => { 504 match self.shortest_dfa(text, start) { 505 dfa::Result::Match(_) => true, 506 dfa::Result::NoMatch(_) => false, 507 dfa::Result::Quit => self.match_nfa(text, start), 508 } 509 } 510 #[cfg(feature = "perf-dfa")] 511 MatchType::DfaAnchoredReverse => { 512 match dfa::Fsm::reverse( 513 &self.ro.dfa_reverse, 514 self.cache.value(), 515 true, 516 &text[start..], 517 text.len(), 518 ) { 519 dfa::Result::Match(_) => true, 520 dfa::Result::NoMatch(_) => false, 521 dfa::Result::Quit => self.match_nfa(text, start), 522 } 523 } 524 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 525 MatchType::DfaSuffix => { 526 match self.shortest_dfa_reverse_suffix(text, start) { 527 dfa::Result::Match(_) => true, 528 dfa::Result::NoMatch(_) => false, 529 dfa::Result::Quit => self.match_nfa(text, start), 530 } 531 } 532 MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), 533 MatchType::Nothing => false, 534 } 535 } 536 537 /// Finds the start and end location of the leftmost-first match, starting 538 /// at the given location. 539 #[cfg_attr(feature = "perf-inline", inline(always))] find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>540 fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { 541 if !self.is_anchor_end_match(text) { 542 return None; 543 } 544 match self.ro.match_type { 545 #[cfg(feature = "perf-literal")] 546 MatchType::Literal(ty) => self.find_literals(ty, text, start), 547 #[cfg(feature = "perf-dfa")] 548 MatchType::Dfa => match self.find_dfa_forward(text, start) { 549 dfa::Result::Match((s, e)) => Some((s, e)), 550 dfa::Result::NoMatch(_) => None, 551 dfa::Result::Quit => { 552 self.find_nfa(MatchNfaType::Auto, text, start) 553 } 554 }, 555 #[cfg(feature = "perf-dfa")] 556 MatchType::DfaAnchoredReverse => { 557 match self.find_dfa_anchored_reverse(text, start) { 558 dfa::Result::Match((s, e)) => Some((s, e)), 559 dfa::Result::NoMatch(_) => None, 560 dfa::Result::Quit => { 561 self.find_nfa(MatchNfaType::Auto, text, start) 562 } 563 } 564 } 565 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 566 MatchType::DfaSuffix => { 567 match self.find_dfa_reverse_suffix(text, start) { 568 dfa::Result::Match((s, e)) => Some((s, e)), 569 dfa::Result::NoMatch(_) => None, 570 dfa::Result::Quit => { 571 self.find_nfa(MatchNfaType::Auto, text, start) 572 } 573 } 574 } 575 MatchType::Nfa(ty) => self.find_nfa(ty, text, start), 576 MatchType::Nothing => None, 577 #[cfg(feature = "perf-dfa")] 578 MatchType::DfaMany => { 579 unreachable!("BUG: RegexSet cannot be used with find") 580 } 581 } 582 } 583 584 /// Finds the start and end location of the leftmost-first match and also 585 /// fills in all matching capture groups. 586 /// 587 /// The number of capture slots given should be equal to the total number 588 /// of capture slots in the compiled program. 589 /// 590 /// Note that the first two slots always correspond to the start and end 591 /// locations of the overall match. captures_read_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>592 fn captures_read_at( 593 &self, 594 locs: &mut Locations, 595 text: &[u8], 596 start: usize, 597 ) -> Option<(usize, usize)> { 598 let slots = locs.as_slots(); 599 for slot in slots.iter_mut() { 600 *slot = None; 601 } 602 // If the caller unnecessarily uses this, then we try to save them 603 // from themselves. 604 match slots.len() { 605 0 => return self.find_at(text, start), 606 2 => { 607 return self.find_at(text, start).map(|(s, e)| { 608 slots[0] = Some(s); 609 slots[1] = Some(e); 610 (s, e) 611 }); 612 } 613 _ => {} // fallthrough 614 } 615 if !self.is_anchor_end_match(text) { 616 return None; 617 } 618 match self.ro.match_type { 619 #[cfg(feature = "perf-literal")] 620 MatchType::Literal(ty) => { 621 self.find_literals(ty, text, start).and_then(|(s, e)| { 622 self.captures_nfa_type( 623 MatchNfaType::Auto, 624 slots, 625 text, 626 s, 627 e, 628 ) 629 }) 630 } 631 #[cfg(feature = "perf-dfa")] 632 MatchType::Dfa => { 633 if self.ro.nfa.is_anchored_start { 634 self.captures_nfa(slots, text, start) 635 } else { 636 match self.find_dfa_forward(text, start) { 637 dfa::Result::Match((s, e)) => self.captures_nfa_type( 638 MatchNfaType::Auto, 639 slots, 640 text, 641 s, 642 e, 643 ), 644 dfa::Result::NoMatch(_) => None, 645 dfa::Result::Quit => { 646 self.captures_nfa(slots, text, start) 647 } 648 } 649 } 650 } 651 #[cfg(feature = "perf-dfa")] 652 MatchType::DfaAnchoredReverse => { 653 match self.find_dfa_anchored_reverse(text, start) { 654 dfa::Result::Match((s, e)) => self.captures_nfa_type( 655 MatchNfaType::Auto, 656 slots, 657 text, 658 s, 659 e, 660 ), 661 dfa::Result::NoMatch(_) => None, 662 dfa::Result::Quit => self.captures_nfa(slots, text, start), 663 } 664 } 665 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 666 MatchType::DfaSuffix => { 667 match self.find_dfa_reverse_suffix(text, start) { 668 dfa::Result::Match((s, e)) => self.captures_nfa_type( 669 MatchNfaType::Auto, 670 slots, 671 text, 672 s, 673 e, 674 ), 675 dfa::Result::NoMatch(_) => None, 676 dfa::Result::Quit => self.captures_nfa(slots, text, start), 677 } 678 } 679 MatchType::Nfa(ty) => { 680 self.captures_nfa_type(ty, slots, text, start, text.len()) 681 } 682 MatchType::Nothing => None, 683 #[cfg(feature = "perf-dfa")] 684 MatchType::DfaMany => { 685 unreachable!("BUG: RegexSet cannot be used with captures") 686 } 687 } 688 } 689 } 690 691 impl<'c> ExecNoSync<'c> { 692 /// Finds the leftmost-first match using only literal search. 693 #[cfg(feature = "perf-literal")] 694 #[cfg_attr(feature = "perf-inline", inline(always))] find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>695 fn find_literals( 696 &self, 697 ty: MatchLiteralType, 698 text: &[u8], 699 start: usize, 700 ) -> Option<(usize, usize)> { 701 use self::MatchLiteralType::*; 702 match ty { 703 Unanchored => { 704 let lits = &self.ro.nfa.prefixes; 705 lits.find(&text[start..]).map(|(s, e)| (start + s, start + e)) 706 } 707 AnchoredStart => { 708 let lits = &self.ro.nfa.prefixes; 709 if start == 0 || !self.ro.nfa.is_anchored_start { 710 lits.find_start(&text[start..]) 711 .map(|(s, e)| (start + s, start + e)) 712 } else { 713 None 714 } 715 } 716 AnchoredEnd => { 717 let lits = &self.ro.suffixes; 718 lits.find_end(&text[start..]) 719 .map(|(s, e)| (start + s, start + e)) 720 } 721 AhoCorasick => self 722 .ro 723 .ac 724 .as_ref() 725 .unwrap() 726 .find(&text[start..]) 727 .map(|m| (start + m.start(), start + m.end())), 728 } 729 } 730 731 /// Finds the leftmost-first match (start and end) using only the DFA. 732 /// 733 /// If the result returned indicates that the DFA quit, then another 734 /// matching engine should be used. 735 #[cfg(feature = "perf-dfa")] 736 #[cfg_attr(feature = "perf-inline", inline(always))] find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>737 fn find_dfa_forward( 738 &self, 739 text: &[u8], 740 start: usize, 741 ) -> dfa::Result<(usize, usize)> { 742 use dfa::Result::*; 743 let end = match dfa::Fsm::forward( 744 &self.ro.dfa, 745 self.cache.value(), 746 false, 747 text, 748 start, 749 ) { 750 NoMatch(i) => return NoMatch(i), 751 Quit => return Quit, 752 Match(end) if start == end => return Match((start, start)), 753 Match(end) => end, 754 }; 755 // Now run the DFA in reverse to find the start of the match. 756 match dfa::Fsm::reverse( 757 &self.ro.dfa_reverse, 758 self.cache.value(), 759 false, 760 &text[start..], 761 end - start, 762 ) { 763 Match(s) => Match((start + s, end)), 764 NoMatch(i) => NoMatch(i), 765 Quit => Quit, 766 } 767 } 768 769 /// Finds the leftmost-first match (start and end) using only the DFA, 770 /// but assumes the regex is anchored at the end and therefore starts at 771 /// the end of the regex and matches in reverse. 772 /// 773 /// If the result returned indicates that the DFA quit, then another 774 /// matching engine should be used. 775 #[cfg(feature = "perf-dfa")] 776 #[cfg_attr(feature = "perf-inline", inline(always))] find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>777 fn find_dfa_anchored_reverse( 778 &self, 779 text: &[u8], 780 start: usize, 781 ) -> dfa::Result<(usize, usize)> { 782 use dfa::Result::*; 783 match dfa::Fsm::reverse( 784 &self.ro.dfa_reverse, 785 self.cache.value(), 786 false, 787 &text[start..], 788 text.len() - start, 789 ) { 790 Match(s) => Match((start + s, text.len())), 791 NoMatch(i) => NoMatch(i), 792 Quit => Quit, 793 } 794 } 795 796 /// Finds the end of the shortest match using only the DFA. 797 #[cfg(feature = "perf-dfa")] 798 #[cfg_attr(feature = "perf-inline", inline(always))] shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>799 fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> { 800 dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start) 801 } 802 803 /// Finds the end of the shortest match using only the DFA by scanning for 804 /// suffix literals. 805 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 806 #[cfg_attr(feature = "perf-inline", inline(always))] shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>807 fn shortest_dfa_reverse_suffix( 808 &self, 809 text: &[u8], 810 start: usize, 811 ) -> dfa::Result<usize> { 812 match self.exec_dfa_reverse_suffix(text, start) { 813 None => self.shortest_dfa(text, start), 814 Some(r) => r.map(|(_, end)| end), 815 } 816 } 817 818 /// Finds the end of the shortest match using only the DFA by scanning for 819 /// suffix literals. It also reports the start of the match. 820 /// 821 /// Note that if None is returned, then the optimization gave up to avoid 822 /// worst case quadratic behavior. A forward scanning DFA should be tried 823 /// next. 824 /// 825 /// If a match is returned and the full leftmost-first match is desired, 826 /// then a forward scan starting from the beginning of the match must be 827 /// done. 828 /// 829 /// If the result returned indicates that the DFA quit, then another 830 /// matching engine should be used. 831 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 832 #[cfg_attr(feature = "perf-inline", inline(always))] exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>833 fn exec_dfa_reverse_suffix( 834 &self, 835 text: &[u8], 836 original_start: usize, 837 ) -> Option<dfa::Result<(usize, usize)>> { 838 use dfa::Result::*; 839 840 let lcs = self.ro.suffixes.lcs(); 841 debug_assert!(lcs.len() >= 1); 842 let mut start = original_start; 843 let mut end = start; 844 let mut last_literal = start; 845 while end <= text.len() { 846 last_literal += match lcs.find(&text[last_literal..]) { 847 None => return Some(NoMatch(text.len())), 848 Some(i) => i, 849 }; 850 end = last_literal + lcs.len(); 851 match dfa::Fsm::reverse( 852 &self.ro.dfa_reverse, 853 self.cache.value(), 854 false, 855 &text[start..end], 856 end - start, 857 ) { 858 Match(0) | NoMatch(0) => return None, 859 Match(i) => return Some(Match((start + i, end))), 860 NoMatch(i) => { 861 start += i; 862 last_literal += 1; 863 continue; 864 } 865 Quit => return Some(Quit), 866 }; 867 } 868 Some(NoMatch(text.len())) 869 } 870 871 /// Finds the leftmost-first match (start and end) using only the DFA 872 /// by scanning for suffix literals. 873 /// 874 /// If the result returned indicates that the DFA quit, then another 875 /// matching engine should be used. 876 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 877 #[cfg_attr(feature = "perf-inline", inline(always))] find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>878 fn find_dfa_reverse_suffix( 879 &self, 880 text: &[u8], 881 start: usize, 882 ) -> dfa::Result<(usize, usize)> { 883 use dfa::Result::*; 884 885 let match_start = match self.exec_dfa_reverse_suffix(text, start) { 886 None => return self.find_dfa_forward(text, start), 887 Some(Match((start, _))) => start, 888 Some(r) => return r, 889 }; 890 // At this point, we've found a match. The only way to quit now 891 // without a match is if the DFA gives up (seems unlikely). 892 // 893 // Now run the DFA forwards to find the proper end of the match. 894 // (The suffix literal match can only indicate the earliest 895 // possible end location, which may appear before the end of the 896 // leftmost-first match.) 897 match dfa::Fsm::forward( 898 &self.ro.dfa, 899 self.cache.value(), 900 false, 901 text, 902 match_start, 903 ) { 904 NoMatch(_) => panic!("BUG: reverse match implies forward match"), 905 Quit => Quit, 906 Match(e) => Match((match_start, e)), 907 } 908 } 909 910 /// Executes the NFA engine to return whether there is a match or not. 911 /// 912 /// Ideally, we could use shortest_nfa(...).is_some() and get the same 913 /// performance characteristics, but regex sets don't have captures, which 914 /// shortest_nfa depends on. 915 #[cfg(feature = "perf-dfa")] match_nfa(&self, text: &[u8], start: usize) -> bool916 fn match_nfa(&self, text: &[u8], start: usize) -> bool { 917 self.match_nfa_type(MatchNfaType::Auto, text, start) 918 } 919 920 /// Like match_nfa, but allows specification of the type of NFA engine. match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool921 fn match_nfa_type( 922 &self, 923 ty: MatchNfaType, 924 text: &[u8], 925 start: usize, 926 ) -> bool { 927 self.exec_nfa( 928 ty, 929 &mut [false], 930 &mut [], 931 true, 932 false, 933 text, 934 start, 935 text.len(), 936 ) 937 } 938 939 /// Finds the shortest match using an NFA. 940 #[cfg(feature = "perf-dfa")] shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>941 fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> { 942 self.shortest_nfa_type(MatchNfaType::Auto, text, start) 943 } 944 945 /// Like shortest_nfa, but allows specification of the type of NFA engine. shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>946 fn shortest_nfa_type( 947 &self, 948 ty: MatchNfaType, 949 text: &[u8], 950 start: usize, 951 ) -> Option<usize> { 952 let mut slots = [None, None]; 953 if self.exec_nfa( 954 ty, 955 &mut [false], 956 &mut slots, 957 true, 958 true, 959 text, 960 start, 961 text.len(), 962 ) { 963 slots[1] 964 } else { 965 None 966 } 967 } 968 969 /// Like find, but executes an NFA engine. find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>970 fn find_nfa( 971 &self, 972 ty: MatchNfaType, 973 text: &[u8], 974 start: usize, 975 ) -> Option<(usize, usize)> { 976 let mut slots = [None, None]; 977 if self.exec_nfa( 978 ty, 979 &mut [false], 980 &mut slots, 981 false, 982 false, 983 text, 984 start, 985 text.len(), 986 ) { 987 match (slots[0], slots[1]) { 988 (Some(s), Some(e)) => Some((s, e)), 989 _ => None, 990 } 991 } else { 992 None 993 } 994 } 995 996 /// Like find_nfa, but fills in captures. 997 /// 998 /// `slots` should have length equal to `2 * nfa.captures.len()`. 999 #[cfg(feature = "perf-dfa")] captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>1000 fn captures_nfa( 1001 &self, 1002 slots: &mut [Slot], 1003 text: &[u8], 1004 start: usize, 1005 ) -> Option<(usize, usize)> { 1006 self.captures_nfa_type( 1007 MatchNfaType::Auto, 1008 slots, 1009 text, 1010 start, 1011 text.len(), 1012 ) 1013 } 1014 1015 /// Like captures_nfa, but allows specification of type of NFA engine. captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> Option<(usize, usize)>1016 fn captures_nfa_type( 1017 &self, 1018 ty: MatchNfaType, 1019 slots: &mut [Slot], 1020 text: &[u8], 1021 start: usize, 1022 end: usize, 1023 ) -> Option<(usize, usize)> { 1024 if self.exec_nfa( 1025 ty, 1026 &mut [false], 1027 slots, 1028 false, 1029 false, 1030 text, 1031 start, 1032 end, 1033 ) { 1034 match (slots[0], slots[1]) { 1035 (Some(s), Some(e)) => Some((s, e)), 1036 _ => None, 1037 } 1038 } else { 1039 None 1040 } 1041 } 1042 exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, quit_after_match_with_pos: bool, text: &[u8], start: usize, end: usize, ) -> bool1043 fn exec_nfa( 1044 &self, 1045 mut ty: MatchNfaType, 1046 matches: &mut [bool], 1047 slots: &mut [Slot], 1048 quit_after_match: bool, 1049 quit_after_match_with_pos: bool, 1050 text: &[u8], 1051 start: usize, 1052 end: usize, 1053 ) -> bool { 1054 use self::MatchNfaType::*; 1055 if let Auto = ty { 1056 if backtrack::should_exec(self.ro.nfa.len(), text.len()) { 1057 ty = Backtrack; 1058 } else { 1059 ty = PikeVM; 1060 } 1061 } 1062 // The backtracker can't return the shortest match position as it is 1063 // implemented today. So if someone calls `shortest_match` and we need 1064 // to run an NFA, then use the PikeVM. 1065 if quit_after_match_with_pos || ty == PikeVM { 1066 self.exec_pikevm( 1067 matches, 1068 slots, 1069 quit_after_match, 1070 text, 1071 start, 1072 end, 1073 ) 1074 } else { 1075 self.exec_backtrack(matches, slots, text, start, end) 1076 } 1077 } 1078 1079 /// Always run the NFA algorithm. exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool1080 fn exec_pikevm( 1081 &self, 1082 matches: &mut [bool], 1083 slots: &mut [Slot], 1084 quit_after_match: bool, 1085 text: &[u8], 1086 start: usize, 1087 end: usize, 1088 ) -> bool { 1089 if self.ro.nfa.uses_bytes() { 1090 pikevm::Fsm::exec( 1091 &self.ro.nfa, 1092 self.cache.value(), 1093 matches, 1094 slots, 1095 quit_after_match, 1096 ByteInput::new(text, self.ro.nfa.only_utf8), 1097 start, 1098 end, 1099 ) 1100 } else { 1101 pikevm::Fsm::exec( 1102 &self.ro.nfa, 1103 self.cache.value(), 1104 matches, 1105 slots, 1106 quit_after_match, 1107 CharInput::new(text), 1108 start, 1109 end, 1110 ) 1111 } 1112 } 1113 1114 /// Always runs the NFA using bounded backtracking. exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> bool1115 fn exec_backtrack( 1116 &self, 1117 matches: &mut [bool], 1118 slots: &mut [Slot], 1119 text: &[u8], 1120 start: usize, 1121 end: usize, 1122 ) -> bool { 1123 if self.ro.nfa.uses_bytes() { 1124 backtrack::Bounded::exec( 1125 &self.ro.nfa, 1126 self.cache.value(), 1127 matches, 1128 slots, 1129 ByteInput::new(text, self.ro.nfa.only_utf8), 1130 start, 1131 end, 1132 ) 1133 } else { 1134 backtrack::Bounded::exec( 1135 &self.ro.nfa, 1136 self.cache.value(), 1137 matches, 1138 slots, 1139 CharInput::new(text), 1140 start, 1141 end, 1142 ) 1143 } 1144 } 1145 1146 /// Finds which regular expressions match the given text. 1147 /// 1148 /// `matches` should have length equal to the number of regexes being 1149 /// searched. 1150 /// 1151 /// This is only useful when one wants to know which regexes in a set 1152 /// match some text. many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1153 pub fn many_matches_at( 1154 &self, 1155 matches: &mut [bool], 1156 text: &[u8], 1157 start: usize, 1158 ) -> bool { 1159 use self::MatchType::*; 1160 if !self.is_anchor_end_match(text) { 1161 return false; 1162 } 1163 match self.ro.match_type { 1164 #[cfg(feature = "perf-literal")] 1165 Literal(ty) => { 1166 debug_assert_eq!(matches.len(), 1); 1167 matches[0] = self.find_literals(ty, text, start).is_some(); 1168 matches[0] 1169 } 1170 #[cfg(feature = "perf-dfa")] 1171 Dfa | DfaAnchoredReverse | DfaMany => { 1172 match dfa::Fsm::forward_many( 1173 &self.ro.dfa, 1174 self.cache.value(), 1175 matches, 1176 text, 1177 start, 1178 ) { 1179 dfa::Result::Match(_) => true, 1180 dfa::Result::NoMatch(_) => false, 1181 dfa::Result::Quit => self.exec_nfa( 1182 MatchNfaType::Auto, 1183 matches, 1184 &mut [], 1185 false, 1186 false, 1187 text, 1188 start, 1189 text.len(), 1190 ), 1191 } 1192 } 1193 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 1194 DfaSuffix => { 1195 match dfa::Fsm::forward_many( 1196 &self.ro.dfa, 1197 self.cache.value(), 1198 matches, 1199 text, 1200 start, 1201 ) { 1202 dfa::Result::Match(_) => true, 1203 dfa::Result::NoMatch(_) => false, 1204 dfa::Result::Quit => self.exec_nfa( 1205 MatchNfaType::Auto, 1206 matches, 1207 &mut [], 1208 false, 1209 false, 1210 text, 1211 start, 1212 text.len(), 1213 ), 1214 } 1215 } 1216 Nfa(ty) => self.exec_nfa( 1217 ty, 1218 matches, 1219 &mut [], 1220 false, 1221 false, 1222 text, 1223 start, 1224 text.len(), 1225 ), 1226 Nothing => false, 1227 } 1228 } 1229 1230 #[cfg_attr(feature = "perf-inline", inline(always))] is_anchor_end_match(&self, text: &[u8]) -> bool1231 fn is_anchor_end_match(&self, text: &[u8]) -> bool { 1232 #[cfg(not(feature = "perf-literal"))] 1233 fn imp(_: &ExecReadOnly, _: &[u8]) -> bool { 1234 true 1235 } 1236 1237 #[cfg(feature = "perf-literal")] 1238 fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool { 1239 // Only do this check if the haystack is big (>1MB). 1240 if text.len() > (1 << 20) && ro.nfa.is_anchored_end { 1241 let lcs = ro.suffixes.lcs(); 1242 if lcs.len() >= 1 && !lcs.is_suffix(text) { 1243 return false; 1244 } 1245 } 1246 true 1247 } 1248 1249 imp(&self.ro, text) 1250 } 1251 capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1252 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1253 &self.ro.nfa.capture_name_idx 1254 } 1255 } 1256 1257 impl<'c> ExecNoSyncStr<'c> { capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1258 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1259 self.0.capture_name_idx() 1260 } 1261 } 1262 1263 impl Exec { 1264 /// Get a searcher that isn't Sync. 1265 #[cfg_attr(feature = "perf-inline", inline(always))] searcher(&self) -> ExecNoSync1266 pub fn searcher(&self) -> ExecNoSync { 1267 ExecNoSync { 1268 ro: &self.ro, // a clone is too expensive here! (and not needed) 1269 cache: self.pool.get(), 1270 } 1271 } 1272 1273 /// Get a searcher that isn't Sync and can match on &str. 1274 #[cfg_attr(feature = "perf-inline", inline(always))] searcher_str(&self) -> ExecNoSyncStr1275 pub fn searcher_str(&self) -> ExecNoSyncStr { 1276 ExecNoSyncStr(self.searcher()) 1277 } 1278 1279 /// Build a Regex from this executor. into_regex(self) -> re_unicode::Regex1280 pub fn into_regex(self) -> re_unicode::Regex { 1281 re_unicode::Regex::from(self) 1282 } 1283 1284 /// Build a RegexSet from this executor. into_regex_set(self) -> re_set::unicode::RegexSet1285 pub fn into_regex_set(self) -> re_set::unicode::RegexSet { 1286 re_set::unicode::RegexSet::from(self) 1287 } 1288 1289 /// Build a Regex from this executor that can match arbitrary bytes. into_byte_regex(self) -> re_bytes::Regex1290 pub fn into_byte_regex(self) -> re_bytes::Regex { 1291 re_bytes::Regex::from(self) 1292 } 1293 1294 /// Build a RegexSet from this executor that can match arbitrary bytes. into_byte_regex_set(self) -> re_set::bytes::RegexSet1295 pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet { 1296 re_set::bytes::RegexSet::from(self) 1297 } 1298 1299 /// The original regular expressions given by the caller that were 1300 /// compiled. regex_strings(&self) -> &[String]1301 pub fn regex_strings(&self) -> &[String] { 1302 &self.ro.res 1303 } 1304 1305 /// Return a slice of capture names. 1306 /// 1307 /// Any capture that isn't named is None. capture_names(&self) -> &[Option<String>]1308 pub fn capture_names(&self) -> &[Option<String>] { 1309 &self.ro.nfa.captures 1310 } 1311 1312 /// Return a reference to named groups mapping (from group name to 1313 /// group position). capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1314 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1315 &self.ro.nfa.capture_name_idx 1316 } 1317 } 1318 1319 impl Clone for Exec { clone(&self) -> Exec1320 fn clone(&self) -> Exec { 1321 let pool = ExecReadOnly::new_pool(&self.ro); 1322 Exec { ro: self.ro.clone(), pool } 1323 } 1324 } 1325 1326 impl ExecReadOnly { choose_match_type(&self, hint: Option<MatchType>) -> MatchType1327 fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType { 1328 if let Some(MatchType::Nfa(_)) = hint { 1329 return hint.unwrap(); 1330 } 1331 // If the NFA is empty, then we'll never match anything. 1332 if self.nfa.insts.is_empty() { 1333 return MatchType::Nothing; 1334 } 1335 if let Some(literalty) = self.choose_literal_match_type() { 1336 return literalty; 1337 } 1338 if let Some(dfaty) = self.choose_dfa_match_type() { 1339 return dfaty; 1340 } 1341 // We're so totally hosed. 1342 MatchType::Nfa(MatchNfaType::Auto) 1343 } 1344 1345 /// If a plain literal scan can be used, then a corresponding literal 1346 /// search type is returned. choose_literal_match_type(&self) -> Option<MatchType>1347 fn choose_literal_match_type(&self) -> Option<MatchType> { 1348 #[cfg(not(feature = "perf-literal"))] 1349 fn imp(_: &ExecReadOnly) -> Option<MatchType> { 1350 None 1351 } 1352 1353 #[cfg(feature = "perf-literal")] 1354 fn imp(ro: &ExecReadOnly) -> Option<MatchType> { 1355 // If our set of prefixes is complete, then we can use it to find 1356 // a match in lieu of a regex engine. This doesn't quite work well 1357 // in the presence of multiple regexes, so only do it when there's 1358 // one. 1359 // 1360 // TODO(burntsushi): Also, don't try to match literals if the regex 1361 // is partially anchored. We could technically do it, but we'd need 1362 // to create two sets of literals: all of them and then the subset 1363 // that aren't anchored. We would then only search for all of them 1364 // when at the beginning of the input and use the subset in all 1365 // other cases. 1366 if ro.res.len() != 1 { 1367 return None; 1368 } 1369 if ro.ac.is_some() { 1370 return Some(MatchType::Literal( 1371 MatchLiteralType::AhoCorasick, 1372 )); 1373 } 1374 if ro.nfa.prefixes.complete() { 1375 return if ro.nfa.is_anchored_start { 1376 Some(MatchType::Literal(MatchLiteralType::AnchoredStart)) 1377 } else { 1378 Some(MatchType::Literal(MatchLiteralType::Unanchored)) 1379 }; 1380 } 1381 if ro.suffixes.complete() { 1382 return if ro.nfa.is_anchored_end { 1383 Some(MatchType::Literal(MatchLiteralType::AnchoredEnd)) 1384 } else { 1385 // This case shouldn't happen. When the regex isn't 1386 // anchored, then complete prefixes should imply complete 1387 // suffixes. 1388 Some(MatchType::Literal(MatchLiteralType::Unanchored)) 1389 }; 1390 } 1391 None 1392 } 1393 1394 imp(self) 1395 } 1396 1397 /// If a DFA scan can be used, then choose the appropriate DFA strategy. choose_dfa_match_type(&self) -> Option<MatchType>1398 fn choose_dfa_match_type(&self) -> Option<MatchType> { 1399 #[cfg(not(feature = "perf-dfa"))] 1400 fn imp(_: &ExecReadOnly) -> Option<MatchType> { 1401 None 1402 } 1403 1404 #[cfg(feature = "perf-dfa")] 1405 fn imp(ro: &ExecReadOnly) -> Option<MatchType> { 1406 if !dfa::can_exec(&ro.dfa) { 1407 return None; 1408 } 1409 // Regex sets require a slightly specialized path. 1410 if ro.res.len() >= 2 { 1411 return Some(MatchType::DfaMany); 1412 } 1413 // If the regex is anchored at the end but not the start, then 1414 // just match in reverse from the end of the haystack. 1415 if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end { 1416 return Some(MatchType::DfaAnchoredReverse); 1417 } 1418 #[cfg(feature = "perf-literal")] 1419 { 1420 // If there's a longish suffix literal, then it might be faster 1421 // to look for that first. 1422 if ro.should_suffix_scan() { 1423 return Some(MatchType::DfaSuffix); 1424 } 1425 } 1426 // Fall back to your garden variety forward searching lazy DFA. 1427 Some(MatchType::Dfa) 1428 } 1429 1430 imp(self) 1431 } 1432 1433 /// Returns true if the program is amenable to suffix scanning. 1434 /// 1435 /// When this is true, as a heuristic, we assume it is OK to quickly scan 1436 /// for suffix literals and then do a *reverse* DFA match from any matches 1437 /// produced by the literal scan. (And then followed by a forward DFA 1438 /// search, since the previously found suffix literal maybe not actually be 1439 /// the end of a match.) 1440 /// 1441 /// This is a bit of a specialized optimization, but can result in pretty 1442 /// big performance wins if 1) there are no prefix literals and 2) the 1443 /// suffix literals are pretty rare in the text. (1) is obviously easy to 1444 /// account for but (2) is harder. As a proxy, we assume that longer 1445 /// strings are generally rarer, so we only enable this optimization when 1446 /// we have a meaty suffix. 1447 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] should_suffix_scan(&self) -> bool1448 fn should_suffix_scan(&self) -> bool { 1449 if self.suffixes.is_empty() { 1450 return false; 1451 } 1452 let lcs_len = self.suffixes.lcs().char_len(); 1453 lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() 1454 } 1455 new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>>1456 fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> { 1457 let ro = ro.clone(); 1458 Box::new(Pool::new(Box::new(move || { 1459 AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro))) 1460 }))) 1461 } 1462 } 1463 1464 #[derive(Clone, Copy, Debug)] 1465 enum MatchType { 1466 /// A single or multiple literal search. This is only used when the regex 1467 /// can be decomposed into a literal search. 1468 #[cfg(feature = "perf-literal")] 1469 Literal(MatchLiteralType), 1470 /// A normal DFA search. 1471 #[cfg(feature = "perf-dfa")] 1472 Dfa, 1473 /// A reverse DFA search starting from the end of a haystack. 1474 #[cfg(feature = "perf-dfa")] 1475 DfaAnchoredReverse, 1476 /// A reverse DFA search with suffix literal scanning. 1477 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 1478 DfaSuffix, 1479 /// Use the DFA on two or more regular expressions. 1480 #[cfg(feature = "perf-dfa")] 1481 DfaMany, 1482 /// An NFA variant. 1483 Nfa(MatchNfaType), 1484 /// No match is ever possible, so don't ever try to search. 1485 Nothing, 1486 } 1487 1488 #[derive(Clone, Copy, Debug)] 1489 #[cfg(feature = "perf-literal")] 1490 enum MatchLiteralType { 1491 /// Match literals anywhere in text. 1492 Unanchored, 1493 /// Match literals only at the start of text. 1494 AnchoredStart, 1495 /// Match literals only at the end of text. 1496 AnchoredEnd, 1497 /// Use an Aho-Corasick automaton. This requires `ac` to be Some on 1498 /// ExecReadOnly. 1499 AhoCorasick, 1500 } 1501 1502 #[derive(Clone, Copy, Debug, Eq, PartialEq)] 1503 enum MatchNfaType { 1504 /// Choose between Backtrack and PikeVM. 1505 Auto, 1506 /// NFA bounded backtracking. 1507 /// 1508 /// (This is only set by tests, since it never makes sense to always want 1509 /// backtracking.) 1510 Backtrack, 1511 /// The Pike VM. 1512 /// 1513 /// (This is only set by tests, since it never makes sense to always want 1514 /// the Pike VM.) 1515 PikeVM, 1516 } 1517 1518 /// `ProgramCache` maintains reusable allocations for each matching engine 1519 /// available to a particular program. 1520 /// 1521 /// We declare this as unwind safe since it's a cache that's only used for 1522 /// performance purposes. If a panic occurs, it is (or should be) always safe 1523 /// to continue using the same regex object. 1524 pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>; 1525 1526 #[derive(Debug)] 1527 pub struct ProgramCacheInner { 1528 pub pikevm: pikevm::Cache, 1529 pub backtrack: backtrack::Cache, 1530 #[cfg(feature = "perf-dfa")] 1531 pub dfa: dfa::Cache, 1532 #[cfg(feature = "perf-dfa")] 1533 pub dfa_reverse: dfa::Cache, 1534 } 1535 1536 impl ProgramCacheInner { new(ro: &ExecReadOnly) -> Self1537 fn new(ro: &ExecReadOnly) -> Self { 1538 ProgramCacheInner { 1539 pikevm: pikevm::Cache::new(&ro.nfa), 1540 backtrack: backtrack::Cache::new(&ro.nfa), 1541 #[cfg(feature = "perf-dfa")] 1542 dfa: dfa::Cache::new(&ro.dfa), 1543 #[cfg(feature = "perf-dfa")] 1544 dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), 1545 } 1546 } 1547 } 1548 1549 /// Alternation literals checks if the given HIR is a simple alternation of 1550 /// literals, and if so, returns them. Otherwise, this returns None. 1551 #[cfg(feature = "perf-literal")] alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>>1552 fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { 1553 use syntax::hir::{HirKind, Literal}; 1554 1555 // This is pretty hacky, but basically, if `is_alternation_literal` is 1556 // true, then we can make several assumptions about the structure of our 1557 // HIR. This is what justifies the `unreachable!` statements below. 1558 // 1559 // This code should be refactored once we overhaul this crate's 1560 // optimization pipeline, because this is a terribly inflexible way to go 1561 // about things. 1562 1563 if !expr.is_alternation_literal() { 1564 return None; 1565 } 1566 let alts = match *expr.kind() { 1567 HirKind::Alternation(ref alts) => alts, 1568 _ => return None, // one literal isn't worth it 1569 }; 1570 1571 let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit { 1572 Literal::Unicode(c) => { 1573 let mut buf = [0; 4]; 1574 dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes()); 1575 } 1576 Literal::Byte(b) => { 1577 dst.push(b); 1578 } 1579 }; 1580 1581 let mut lits = vec![]; 1582 for alt in alts { 1583 let mut lit = vec![]; 1584 match *alt.kind() { 1585 HirKind::Literal(ref x) => extendlit(x, &mut lit), 1586 HirKind::Concat(ref exprs) => { 1587 for e in exprs { 1588 match *e.kind() { 1589 HirKind::Literal(ref x) => extendlit(x, &mut lit), 1590 _ => unreachable!("expected literal, got {:?}", e), 1591 } 1592 } 1593 } 1594 _ => unreachable!("expected literal or concat, got {:?}", alt), 1595 } 1596 lits.push(lit); 1597 } 1598 Some(lits) 1599 } 1600 1601 #[cfg(test)] 1602 mod test { 1603 #[test] uppercut_s_backtracking_bytes_default_bytes_mismatch()1604 fn uppercut_s_backtracking_bytes_default_bytes_mismatch() { 1605 use internal::ExecBuilder; 1606 1607 let backtrack_bytes_re = ExecBuilder::new("^S") 1608 .bounded_backtracking() 1609 .only_utf8(false) 1610 .build() 1611 .map(|exec| exec.into_byte_regex()) 1612 .map_err(|err| format!("{}", err)) 1613 .unwrap(); 1614 1615 let default_bytes_re = ExecBuilder::new("^S") 1616 .only_utf8(false) 1617 .build() 1618 .map(|exec| exec.into_byte_regex()) 1619 .map_err(|err| format!("{}", err)) 1620 .unwrap(); 1621 1622 let input = vec![83, 83]; 1623 1624 let s1 = backtrack_bytes_re.split(&input); 1625 let s2 = default_bytes_re.split(&input); 1626 for (chunk1, chunk2) in s1.zip(s2) { 1627 assert_eq!(chunk1, chunk2); 1628 } 1629 } 1630 1631 #[test] unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1632 fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() { 1633 use internal::ExecBuilder; 1634 1635 let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)") 1636 .bounded_backtracking() 1637 .bytes(true) 1638 .build() 1639 .map(|exec| exec.into_regex()) 1640 .map_err(|err| format!("{}", err)) 1641 .unwrap(); 1642 1643 let default_bytes_re = ExecBuilder::new(r"^(?u:\*)") 1644 .bytes(true) 1645 .build() 1646 .map(|exec| exec.into_regex()) 1647 .map_err(|err| format!("{}", err)) 1648 .unwrap(); 1649 1650 let input = "**"; 1651 1652 let s1 = backtrack_bytes_re.split(input); 1653 let s2 = default_bytes_re.split(input); 1654 for (chunk1, chunk2) in s1.zip(s2) { 1655 assert_eq!(chunk1, chunk2); 1656 } 1657 } 1658 } 1659