1 use core::cmp; 2 3 use cow::CowBytes; 4 use ext_slice::ByteSlice; 5 use search::prefilter::{Freqy, PrefilterState}; 6 7 /// An implementation of the TwoWay substring search algorithm, with heuristics 8 /// for accelerating search based on frequency analysis. 9 /// 10 /// This searcher supports forward and reverse search, although not 11 /// simultaneously. It runs in O(n + m) time and O(1) space, where 12 /// `n ~ len(needle)` and `m ~ len(haystack)`. 13 /// 14 /// The implementation here roughly matches that which was developed by 15 /// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The 16 /// only change in this implementation is the use of zero-based indices and 17 /// the addition of heuristics for a fast skip loop. That is, this will detect 18 /// bytes that are believed to be rare in the needle and use fast vectorized 19 /// instructions to find their occurrences quickly. The Two-Way algorithm is 20 /// then used to confirm whether a match at that location occurred. 21 /// 22 /// The heuristic for fast skipping is automatically shut off if it's 23 /// detected to be ineffective at search time. Generally, this only occurs in 24 /// pathological cases. But this is generally necessary in order to preserve 25 /// a `O(n + m)` time bound. 26 /// 27 /// The code below is fairly complex and not obviously correct at all. It's 28 /// likely necessary to read the Two-Way paper cited above in order to fully 29 /// grok this code. 30 #[derive(Clone, Debug)] 31 pub struct TwoWay<'b> { 32 /// The needle that we're looking for. 33 needle: CowBytes<'b>, 34 /// An implementation of a fast skip loop based on hard-coded frequency 35 /// data. This is only used when conditions are deemed favorable. 36 freqy: Freqy, 37 /// A critical position in needle. Specifically, this position corresponds 38 /// to beginning of either the minimal or maximal suffix in needle. (N.B. 39 /// See SuffixType below for why "minimal" isn't quite the correct word 40 /// here.) 41 /// 42 /// This is the position at which every search begins. Namely, search 43 /// starts by scanning text to the right of this position, and only if 44 /// there's a match does the text to the left of this position get scanned. 45 critical_pos: usize, 46 /// The amount we shift by in the Two-Way search algorithm. This 47 /// corresponds to the "small period" and "large period" cases. 48 shift: Shift, 49 } 50 51 impl<'b> TwoWay<'b> { 52 /// Create a searcher that uses the Two-Way algorithm by searching forwards 53 /// through any haystack. forward(needle: &'b [u8]) -> TwoWay<'b>54 pub fn forward(needle: &'b [u8]) -> TwoWay<'b> { 55 let freqy = Freqy::forward(needle); 56 if needle.is_empty() { 57 return TwoWay { 58 needle: CowBytes::new(needle), 59 freqy, 60 critical_pos: 0, 61 shift: Shift::Large { shift: 0 }, 62 }; 63 } 64 65 let min_suffix = Suffix::forward(needle, SuffixKind::Minimal); 66 let max_suffix = Suffix::forward(needle, SuffixKind::Maximal); 67 let (period_lower_bound, critical_pos) = 68 if min_suffix.pos > max_suffix.pos { 69 (min_suffix.period, min_suffix.pos) 70 } else { 71 (max_suffix.period, max_suffix.pos) 72 }; 73 let shift = Shift::forward(needle, period_lower_bound, critical_pos); 74 let needle = CowBytes::new(needle); 75 TwoWay { needle, freqy, critical_pos, shift } 76 } 77 78 /// Create a searcher that uses the Two-Way algorithm by searching in 79 /// reverse through any haystack. reverse(needle: &'b [u8]) -> TwoWay<'b>80 pub fn reverse(needle: &'b [u8]) -> TwoWay<'b> { 81 let freqy = Freqy::reverse(needle); 82 if needle.is_empty() { 83 return TwoWay { 84 needle: CowBytes::new(needle), 85 freqy, 86 critical_pos: 0, 87 shift: Shift::Large { shift: 0 }, 88 }; 89 } 90 91 let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal); 92 let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal); 93 let (period_lower_bound, critical_pos) = 94 if min_suffix.pos < max_suffix.pos { 95 (min_suffix.period, min_suffix.pos) 96 } else { 97 (max_suffix.period, max_suffix.pos) 98 }; 99 let shift = Shift::reverse(needle, period_lower_bound, critical_pos); 100 let needle = CowBytes::new(needle); 101 TwoWay { needle, freqy, critical_pos, shift } 102 } 103 104 /// Return a fresh prefilter state that can be used with this searcher. 105 /// A prefilter state is used to track the effectiveness of a searcher's 106 /// prefilter for speeding up searches. Therefore, the prefilter state 107 /// should generally be reused on subsequent searches (such as in an 108 /// iterator). For searches on a different haystack, then a new prefilter 109 /// state should be used. 110 /// 111 /// This always initializes a valid prefilter state even if this searcher 112 /// does not have a prefilter enabled. prefilter_state(&self) -> PrefilterState113 pub fn prefilter_state(&self) -> PrefilterState { 114 self.freqy.prefilter_state() 115 } 116 117 /// Return the needle used by this searcher. needle(&self) -> &[u8]118 pub fn needle(&self) -> &[u8] { 119 self.needle.as_slice() 120 } 121 122 /// Convert this searched into an owned version, where the needle is 123 /// copied if it isn't already owned. 124 #[cfg(feature = "std")] into_owned(self) -> TwoWay<'static>125 pub fn into_owned(self) -> TwoWay<'static> { 126 TwoWay { 127 needle: self.needle.into_owned(), 128 freqy: self.freqy, 129 critical_pos: self.critical_pos, 130 shift: self.shift, 131 } 132 } 133 134 /// Find the position of the first occurrence of this searcher's needle in 135 /// the given haystack. If one does not exist, then return None. 136 /// 137 /// This will automatically initialize prefilter state. This should only 138 /// be used for one-off searches. find(&self, haystack: &[u8]) -> Option<usize>139 pub fn find(&self, haystack: &[u8]) -> Option<usize> { 140 self.find_with(&mut self.prefilter_state(), haystack) 141 } 142 143 /// Find the position of the last occurrence of this searcher's needle 144 /// in the given haystack. If one does not exist, then return None. 145 /// 146 /// This will automatically initialize prefilter state. This should only 147 /// be used for one-off searches. rfind(&self, haystack: &[u8]) -> Option<usize>148 pub fn rfind(&self, haystack: &[u8]) -> Option<usize> { 149 self.rfind_with(&mut self.prefilter_state(), haystack) 150 } 151 152 /// Find the position of the first occurrence of this searcher's needle in 153 /// the given haystack. If one does not exist, then return None. 154 /// 155 /// This accepts prefilter state that is useful when using the same 156 /// searcher multiple times, such as in an iterator. find_with( &self, prestate: &mut PrefilterState, haystack: &[u8], ) -> Option<usize>157 pub fn find_with( 158 &self, 159 prestate: &mut PrefilterState, 160 haystack: &[u8], 161 ) -> Option<usize> { 162 if self.needle.is_empty() { 163 return Some(0); 164 } else if haystack.len() < self.needle.len() { 165 return None; 166 } else if self.needle.len() == 1 { 167 return haystack.find_byte(self.needle[0]); 168 } 169 match self.shift { 170 Shift::Small { period } => { 171 self.find_small(prestate, haystack, period) 172 } 173 Shift::Large { shift } => { 174 self.find_large(prestate, haystack, shift) 175 } 176 } 177 } 178 179 /// Find the position of the last occurrence of this searcher's needle 180 /// in the given haystack. If one does not exist, then return None. 181 /// 182 /// This accepts prefilter state that is useful when using the same 183 /// searcher multiple times, such as in an iterator. rfind_with( &self, prestate: &mut PrefilterState, haystack: &[u8], ) -> Option<usize>184 pub fn rfind_with( 185 &self, 186 prestate: &mut PrefilterState, 187 haystack: &[u8], 188 ) -> Option<usize> { 189 if self.needle.is_empty() { 190 return Some(haystack.len()); 191 } else if haystack.len() < self.needle.len() { 192 return None; 193 } else if self.needle.len() == 1 { 194 return haystack.rfind_byte(self.needle[0]); 195 } 196 match self.shift { 197 Shift::Small { period } => { 198 self.rfind_small(prestate, haystack, period) 199 } 200 Shift::Large { shift } => { 201 self.rfind_large(prestate, haystack, shift) 202 } 203 } 204 } 205 206 // Below is the actual implementation of TwoWay searching, including both 207 // forwards and backwards searching. Each forward and reverse search has 208 // two fairly similar implementations, each handling the small and large 209 // period cases, for a total 4 different search routines. 210 // 211 // On top of that, each search implementation can be accelerated by a 212 // Freqy prefilter, but it is not always enabled. To avoid its overhead 213 // when its disabled, we explicitly inline each search implementation based 214 // on whether Freqy will be used or not. This brings us up to a total of 215 // 8 monomorphized versions of the search code. 216 217 #[inline(never)] find_small( &self, prestate: &mut PrefilterState, haystack: &[u8], period: usize, ) -> Option<usize>218 fn find_small( 219 &self, 220 prestate: &mut PrefilterState, 221 haystack: &[u8], 222 period: usize, 223 ) -> Option<usize> { 224 if prestate.is_effective() { 225 self.find_small_imp(prestate, true, haystack, period) 226 } else { 227 self.find_small_imp(prestate, false, haystack, period) 228 } 229 } 230 231 #[inline(always)] find_small_imp( &self, prestate: &mut PrefilterState, prefilter: bool, haystack: &[u8], period: usize, ) -> Option<usize>232 fn find_small_imp( 233 &self, 234 prestate: &mut PrefilterState, 235 prefilter: bool, 236 haystack: &[u8], 237 period: usize, 238 ) -> Option<usize> { 239 let needle = self.needle.as_slice(); 240 let mut pos = 0; 241 let mut shift = 0; 242 while pos + needle.len() <= haystack.len() { 243 let mut i = cmp::max(self.critical_pos, shift); 244 if prefilter && prestate.is_effective() { 245 match self.freqy.find_candidate(prestate, &haystack[pos..]) { 246 None => return None, 247 Some(found) => { 248 shift = 0; 249 i = self.critical_pos; 250 pos += found; 251 if pos + needle.len() > haystack.len() { 252 return None; 253 } 254 } 255 } 256 } 257 while i < needle.len() && needle[i] == haystack[pos + i] { 258 i += 1; 259 } 260 if i < needle.len() { 261 pos += i - self.critical_pos + 1; 262 shift = 0; 263 } else { 264 let mut j = self.critical_pos; 265 while j > shift && needle[j] == haystack[pos + j] { 266 j -= 1; 267 } 268 if j <= shift && needle[shift] == haystack[pos + shift] { 269 return Some(pos); 270 } 271 pos += period; 272 shift = needle.len() - period; 273 } 274 } 275 None 276 } 277 278 #[inline(never)] find_large( &self, prestate: &mut PrefilterState, haystack: &[u8], shift: usize, ) -> Option<usize>279 fn find_large( 280 &self, 281 prestate: &mut PrefilterState, 282 haystack: &[u8], 283 shift: usize, 284 ) -> Option<usize> { 285 if prestate.is_effective() { 286 self.find_large_imp(prestate, true, haystack, shift) 287 } else { 288 self.find_large_imp(prestate, false, haystack, shift) 289 } 290 } 291 292 #[inline(always)] find_large_imp( &self, prestate: &mut PrefilterState, prefilter: bool, haystack: &[u8], shift: usize, ) -> Option<usize>293 fn find_large_imp( 294 &self, 295 prestate: &mut PrefilterState, 296 prefilter: bool, 297 haystack: &[u8], 298 shift: usize, 299 ) -> Option<usize> { 300 let needle = self.needle.as_slice(); 301 let mut pos = 0; 302 while pos + needle.len() <= haystack.len() { 303 let mut i = self.critical_pos; 304 if prefilter && prestate.is_effective() { 305 match self.freqy.find_candidate(prestate, &haystack[pos..]) { 306 None => return None, 307 Some(found) => { 308 pos += found; 309 if pos + needle.len() > haystack.len() { 310 return None; 311 } 312 } 313 } 314 } 315 while i < needle.len() && needle[i] == haystack[pos + i] { 316 i += 1; 317 } 318 if i < needle.len() { 319 pos += i - self.critical_pos + 1; 320 } else { 321 let mut j = self.critical_pos; 322 while j > 0 && needle[j] == haystack[pos + j] { 323 j -= 1; 324 } 325 if j == 0 && needle[0] == haystack[pos] { 326 return Some(pos); 327 } 328 pos += shift; 329 } 330 } 331 None 332 } 333 334 #[inline(never)] rfind_small( &self, prestate: &mut PrefilterState, haystack: &[u8], period: usize, ) -> Option<usize>335 fn rfind_small( 336 &self, 337 prestate: &mut PrefilterState, 338 haystack: &[u8], 339 period: usize, 340 ) -> Option<usize> { 341 if prestate.is_effective() { 342 self.rfind_small_imp(prestate, true, haystack, period) 343 } else { 344 self.rfind_small_imp(prestate, false, haystack, period) 345 } 346 } 347 348 #[inline(always)] rfind_small_imp( &self, prestate: &mut PrefilterState, prefilter: bool, haystack: &[u8], period: usize, ) -> Option<usize>349 fn rfind_small_imp( 350 &self, 351 prestate: &mut PrefilterState, 352 prefilter: bool, 353 haystack: &[u8], 354 period: usize, 355 ) -> Option<usize> { 356 let needle = &*self.needle; 357 let nlen = needle.len(); 358 let mut pos = haystack.len(); 359 let mut shift = nlen; 360 while pos >= nlen { 361 let mut i = cmp::min(self.critical_pos, shift); 362 if prefilter && prestate.is_effective() { 363 match self.freqy.rfind_candidate(prestate, &haystack[..pos]) { 364 None => return None, 365 Some(found) => { 366 shift = nlen; 367 i = self.critical_pos; 368 pos = found; 369 if pos < nlen { 370 return None; 371 } 372 } 373 } 374 } 375 while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { 376 i -= 1; 377 } 378 if i > 0 || needle[0] != haystack[pos - nlen] { 379 pos -= self.critical_pos - i + 1; 380 shift = nlen; 381 } else { 382 let mut j = self.critical_pos; 383 while j < shift && needle[j] == haystack[pos - nlen + j] { 384 j += 1; 385 } 386 if j == shift { 387 return Some(pos - nlen); 388 } 389 pos -= period; 390 shift = period; 391 } 392 } 393 None 394 } 395 396 #[inline(never)] rfind_large( &self, prestate: &mut PrefilterState, haystack: &[u8], shift: usize, ) -> Option<usize>397 fn rfind_large( 398 &self, 399 prestate: &mut PrefilterState, 400 haystack: &[u8], 401 shift: usize, 402 ) -> Option<usize> { 403 if prestate.is_effective() { 404 self.rfind_large_imp(prestate, true, haystack, shift) 405 } else { 406 self.rfind_large_imp(prestate, false, haystack, shift) 407 } 408 } 409 410 #[inline(always)] rfind_large_imp( &self, prestate: &mut PrefilterState, prefilter: bool, haystack: &[u8], shift: usize, ) -> Option<usize>411 fn rfind_large_imp( 412 &self, 413 prestate: &mut PrefilterState, 414 prefilter: bool, 415 haystack: &[u8], 416 shift: usize, 417 ) -> Option<usize> { 418 let needle = &*self.needle; 419 let nlen = needle.len(); 420 let mut pos = haystack.len(); 421 while pos >= nlen { 422 if prefilter && prestate.is_effective() { 423 match self.freqy.rfind_candidate(prestate, &haystack[..pos]) { 424 None => return None, 425 Some(found) => { 426 pos = found; 427 if pos < nlen { 428 return None; 429 } 430 } 431 } 432 } 433 434 let mut i = self.critical_pos; 435 while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { 436 i -= 1; 437 } 438 if i > 0 || needle[0] != haystack[pos - nlen] { 439 pos -= self.critical_pos - i + 1; 440 } else { 441 let mut j = self.critical_pos; 442 while j < nlen && needle[j] == haystack[pos - nlen + j] { 443 j += 1; 444 } 445 if j == nlen { 446 return Some(pos - nlen); 447 } 448 pos -= shift; 449 } 450 } 451 None 452 } 453 } 454 455 /// A representation of the amount we're allowed to shift by during Two-Way 456 /// search. 457 /// 458 /// When computing a critical factorization of the needle, we find the position 459 /// of the critical factorization by finding the needle's maximal (or minimal) 460 /// suffix, along with the period of that suffix. It turns out that the period 461 /// of that suffix is a lower bound on the period of the needle itself. 462 /// 463 /// This lower bound is equivalent to the actual period of the needle in 464 /// some cases. To describe that case, we denote the needle as `x` where 465 /// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower 466 /// bound given here is always the period of `v`, which is `<= period(x)`. The 467 /// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and 468 /// where `u` is a suffix of `v[0..period(v)]`. 469 /// 470 /// This case is important because the search algorithm for when the 471 /// periods are equivalent is slightly different than the search algorithm 472 /// for when the periods are not equivalent. In particular, when they aren't 473 /// equivalent, we know that the period of the needle is no less than half its 474 /// length. In this case, we shift by an amount less than or equal to the 475 /// period of the needle (determined by the maximum length of the components 476 /// of the critical factorization of `x`, i.e., `max(len(u), len(v))`).. 477 /// 478 /// The above two cases are represented by the variants below. Each entails 479 /// a different instantiation of the Two-Way search algorithm. 480 /// 481 /// N.B. If we could find a way to compute the exact period in all cases, 482 /// then we could collapse this case analysis and simplify the algorithm. The 483 /// Two-Way paper suggests this is possible, but more reading is required to 484 /// grok why the authors didn't pursue that path. 485 #[derive(Clone, Debug)] 486 enum Shift { 487 Small { period: usize }, 488 Large { shift: usize }, 489 } 490 491 impl Shift { 492 /// Compute the shift for a given needle in the forward direction. 493 /// 494 /// This requires a lower bound on the period and a critical position. 495 /// These can be computed by extracting both the minimal and maximal 496 /// lexicographic suffixes, and choosing the right-most starting position. 497 /// The lower bound on the period is then the period of the chosen suffix. forward( needle: &[u8], period_lower_bound: usize, critical_pos: usize, ) -> Shift498 fn forward( 499 needle: &[u8], 500 period_lower_bound: usize, 501 critical_pos: usize, 502 ) -> Shift { 503 let large = cmp::max(critical_pos, needle.len() - critical_pos); 504 if critical_pos * 2 >= needle.len() { 505 return Shift::Large { shift: large }; 506 } 507 508 let (u, v) = needle.split_at(critical_pos); 509 if !v[..period_lower_bound].ends_with(u) { 510 return Shift::Large { shift: large }; 511 } 512 Shift::Small { period: period_lower_bound } 513 } 514 515 /// Compute the shift for a given needle in the reverse direction. 516 /// 517 /// This requires a lower bound on the period and a critical position. 518 /// These can be computed by extracting both the minimal and maximal 519 /// lexicographic suffixes, and choosing the left-most starting position. 520 /// The lower bound on the period is then the period of the chosen suffix. reverse( needle: &[u8], period_lower_bound: usize, critical_pos: usize, ) -> Shift521 fn reverse( 522 needle: &[u8], 523 period_lower_bound: usize, 524 critical_pos: usize, 525 ) -> Shift { 526 let large = cmp::max(critical_pos, needle.len() - critical_pos); 527 if (needle.len() - critical_pos) * 2 >= needle.len() { 528 return Shift::Large { shift: large }; 529 } 530 531 let (v, u) = needle.split_at(critical_pos); 532 if !v[v.len() - period_lower_bound..].starts_with(u) { 533 return Shift::Large { shift: large }; 534 } 535 Shift::Small { period: period_lower_bound } 536 } 537 } 538 539 /// A suffix extracted from a needle along with its period. 540 #[derive(Debug)] 541 struct Suffix { 542 /// The starting position of this suffix. 543 /// 544 /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this 545 /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for 546 /// forward suffixes, this is an inclusive starting position, where as for 547 /// reverse suffixes, this is an exclusive ending position. 548 pos: usize, 549 /// The period of this suffix. 550 /// 551 /// Note that this is NOT necessarily the period of the string from which 552 /// this suffix comes from. (It is always less than or equal to the period 553 /// of the original string.) 554 period: usize, 555 } 556 557 impl Suffix { forward(needle: &[u8], kind: SuffixKind) -> Suffix558 fn forward(needle: &[u8], kind: SuffixKind) -> Suffix { 559 debug_assert!(!needle.is_empty()); 560 561 // suffix represents our maximal (or minimal) suffix, along with 562 // its period. 563 let mut suffix = Suffix { pos: 0, period: 1 }; 564 // The start of a suffix in `needle` that we are considering as a 565 // more maximal (or minimal) suffix than what's in `suffix`. 566 let mut candidate_start = 1; 567 // The current offset of our suffixes that we're comparing. 568 // 569 // When the characters at this offset are the same, then we mush on 570 // to the next position since no decision is possible. When the 571 // candidate's character is greater (or lesser) than the corresponding 572 // character than our current maximal (or minimal) suffix, then the 573 // current suffix is changed over to the candidate and we restart our 574 // search. Otherwise, the candidate suffix is no good and we restart 575 // our search on the next candidate. 576 // 577 // The three cases above correspond to the three cases in the loop 578 // below. 579 let mut offset = 0; 580 581 while candidate_start + offset < needle.len() { 582 let current = needle[suffix.pos + offset]; 583 let candidate = needle[candidate_start + offset]; 584 match kind.cmp(current, candidate) { 585 SuffixOrdering::Accept => { 586 suffix = Suffix { pos: candidate_start, period: 1 }; 587 candidate_start += 1; 588 offset = 0; 589 } 590 SuffixOrdering::Skip => { 591 candidate_start += offset + 1; 592 offset = 0; 593 suffix.period = candidate_start - suffix.pos; 594 } 595 SuffixOrdering::Push => { 596 if offset + 1 == suffix.period { 597 candidate_start += suffix.period; 598 offset = 0; 599 } else { 600 offset += 1; 601 } 602 } 603 } 604 } 605 suffix 606 } 607 reverse(needle: &[u8], kind: SuffixKind) -> Suffix608 fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix { 609 debug_assert!(!needle.is_empty()); 610 611 // See the comments in `forward` for how this works. 612 let mut suffix = Suffix { pos: needle.len(), period: 1 }; 613 if needle.len() == 1 { 614 return suffix; 615 } 616 let mut candidate_start = needle.len() - 1; 617 let mut offset = 0; 618 619 while offset < candidate_start { 620 let current = needle[suffix.pos - offset - 1]; 621 let candidate = needle[candidate_start - offset - 1]; 622 match kind.cmp(current, candidate) { 623 SuffixOrdering::Accept => { 624 suffix = Suffix { pos: candidate_start, period: 1 }; 625 candidate_start -= 1; 626 offset = 0; 627 } 628 SuffixOrdering::Skip => { 629 candidate_start -= offset + 1; 630 offset = 0; 631 suffix.period = suffix.pos - candidate_start; 632 } 633 SuffixOrdering::Push => { 634 if offset + 1 == suffix.period { 635 candidate_start -= suffix.period; 636 offset = 0; 637 } else { 638 offset += 1; 639 } 640 } 641 } 642 } 643 suffix 644 } 645 } 646 647 /// The kind of suffix to extract. 648 #[derive(Clone, Copy, Debug)] 649 enum SuffixKind { 650 /// Extract the smallest lexicographic suffix from a string. 651 /// 652 /// Technically, this doesn't actually pick the smallest lexicographic 653 /// suffix. e.g., Given the choice between `a` and `aa`, this will choose 654 /// the latter over the former, even though `a < aa`. The reasoning for 655 /// this isn't clear from the paper, but it still smells like a minimal 656 /// suffix. 657 Minimal, 658 /// Extract the largest lexicographic suffix from a string. 659 /// 660 /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given 661 /// the choice between `z` and `zz`, this will choose the latter over the 662 /// former. 663 Maximal, 664 } 665 666 /// The result of comparing corresponding bytes between two suffixes. 667 #[derive(Clone, Copy, Debug)] 668 enum SuffixOrdering { 669 /// This occurs when the given candidate byte indicates that the candidate 670 /// suffix is better than the current maximal (or minimal) suffix. That is, 671 /// the current candidate suffix should supplant the current maximal (or 672 /// minimal) suffix. 673 Accept, 674 /// This occurs when the given candidate byte excludes the candidate suffix 675 /// from being better than the current maximal (or minimal) suffix. That 676 /// is, the current candidate suffix should be dropped and the next one 677 /// should be considered. 678 Skip, 679 /// This occurs when no decision to accept or skip the candidate suffix 680 /// can be made, e.g., when corresponding bytes are equivalent. In this 681 /// case, the next corresponding bytes should be compared. 682 Push, 683 } 684 685 impl SuffixKind { 686 /// Returns true if and only if the given candidate byte indicates that 687 /// it should replace the current suffix as the maximal (or minimal) 688 /// suffix. cmp(self, current: u8, candidate: u8) -> SuffixOrdering689 fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering { 690 use self::SuffixOrdering::*; 691 692 match self { 693 SuffixKind::Minimal if candidate < current => Accept, 694 SuffixKind::Minimal if candidate > current => Skip, 695 SuffixKind::Minimal => Push, 696 SuffixKind::Maximal if candidate > current => Accept, 697 SuffixKind::Maximal if candidate < current => Skip, 698 SuffixKind::Maximal => Push, 699 } 700 } 701 } 702 703 // N.B. There are more holistic tests in src/search/tests.rs. 704 #[cfg(test)] 705 mod tests { 706 use super::*; 707 use ext_slice::B; 708 709 /// Convenience wrapper for computing the suffix as a byte string. get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize)710 fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { 711 let s = Suffix::forward(needle, kind); 712 (&needle[s.pos..], s.period) 713 } 714 715 /// Convenience wrapper for computing the reverse suffix as a byte string. get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize)716 fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { 717 let s = Suffix::reverse(needle, kind); 718 (&needle[..s.pos], s.period) 719 } 720 721 /// Return all of the non-empty suffixes in the given byte string. suffixes(bytes: &[u8]) -> Vec<&[u8]>722 fn suffixes(bytes: &[u8]) -> Vec<&[u8]> { 723 (0..bytes.len()).map(|i| &bytes[i..]).collect() 724 } 725 726 /// Return the lexicographically maximal suffix of the given byte string. naive_maximal_suffix_forward(needle: &[u8]) -> &[u8]727 fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] { 728 let mut sufs = suffixes(needle); 729 sufs.sort(); 730 sufs.pop().unwrap() 731 } 732 733 /// Return the lexicographically maximal suffix of the reverse of the given 734 /// byte string. naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8>735 fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> { 736 let mut reversed = needle.to_vec(); 737 reversed.reverse(); 738 let mut got = naive_maximal_suffix_forward(&reversed).to_vec(); 739 got.reverse(); 740 got 741 } 742 743 #[test] suffix_forward()744 fn suffix_forward() { 745 macro_rules! assert_suffix_min { 746 ($given:expr, $expected:expr, $period:expr) => { 747 let (got_suffix, got_period) = 748 get_suffix_forward($given.as_bytes(), SuffixKind::Minimal); 749 assert_eq!((B($expected), $period), (got_suffix, got_period)); 750 }; 751 } 752 753 macro_rules! assert_suffix_max { 754 ($given:expr, $expected:expr, $period:expr) => { 755 let (got_suffix, got_period) = 756 get_suffix_forward($given.as_bytes(), SuffixKind::Maximal); 757 assert_eq!((B($expected), $period), (got_suffix, got_period)); 758 }; 759 } 760 761 assert_suffix_min!("a", "a", 1); 762 assert_suffix_max!("a", "a", 1); 763 764 assert_suffix_min!("ab", "ab", 2); 765 assert_suffix_max!("ab", "b", 1); 766 767 assert_suffix_min!("ba", "a", 1); 768 assert_suffix_max!("ba", "ba", 2); 769 770 assert_suffix_min!("abc", "abc", 3); 771 assert_suffix_max!("abc", "c", 1); 772 773 assert_suffix_min!("acb", "acb", 3); 774 assert_suffix_max!("acb", "cb", 2); 775 776 assert_suffix_min!("cba", "a", 1); 777 assert_suffix_max!("cba", "cba", 3); 778 779 assert_suffix_min!("abcabc", "abcabc", 3); 780 assert_suffix_max!("abcabc", "cabc", 3); 781 782 assert_suffix_min!("abcabcabc", "abcabcabc", 3); 783 assert_suffix_max!("abcabcabc", "cabcabc", 3); 784 785 assert_suffix_min!("abczz", "abczz", 5); 786 assert_suffix_max!("abczz", "zz", 1); 787 788 assert_suffix_min!("zzabc", "abc", 3); 789 assert_suffix_max!("zzabc", "zzabc", 5); 790 791 assert_suffix_min!("aaa", "aaa", 1); 792 assert_suffix_max!("aaa", "aaa", 1); 793 794 assert_suffix_min!("foobar", "ar", 2); 795 assert_suffix_max!("foobar", "r", 1); 796 } 797 798 #[test] suffix_reverse()799 fn suffix_reverse() { 800 macro_rules! assert_suffix_min { 801 ($given:expr, $expected:expr, $period:expr) => { 802 let (got_suffix, got_period) = 803 get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal); 804 assert_eq!((B($expected), $period), (got_suffix, got_period)); 805 }; 806 } 807 808 macro_rules! assert_suffix_max { 809 ($given:expr, $expected:expr, $period:expr) => { 810 let (got_suffix, got_period) = 811 get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal); 812 assert_eq!((B($expected), $period), (got_suffix, got_period)); 813 }; 814 } 815 816 assert_suffix_min!("a", "a", 1); 817 assert_suffix_max!("a", "a", 1); 818 819 assert_suffix_min!("ab", "a", 1); 820 assert_suffix_max!("ab", "ab", 2); 821 822 assert_suffix_min!("ba", "ba", 2); 823 assert_suffix_max!("ba", "b", 1); 824 825 assert_suffix_min!("abc", "a", 1); 826 assert_suffix_max!("abc", "abc", 3); 827 828 assert_suffix_min!("acb", "a", 1); 829 assert_suffix_max!("acb", "ac", 2); 830 831 assert_suffix_min!("cba", "cba", 3); 832 assert_suffix_max!("cba", "c", 1); 833 834 assert_suffix_min!("abcabc", "abca", 3); 835 assert_suffix_max!("abcabc", "abcabc", 3); 836 837 assert_suffix_min!("abcabcabc", "abcabca", 3); 838 assert_suffix_max!("abcabcabc", "abcabcabc", 3); 839 840 assert_suffix_min!("abczz", "a", 1); 841 assert_suffix_max!("abczz", "abczz", 5); 842 843 assert_suffix_min!("zzabc", "zza", 3); 844 assert_suffix_max!("zzabc", "zz", 1); 845 846 assert_suffix_min!("aaa", "aaa", 1); 847 assert_suffix_max!("aaa", "aaa", 1); 848 } 849 850 quickcheck! { 851 fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool { 852 if bytes.is_empty() { 853 return true; 854 } 855 856 let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal); 857 let expected = naive_maximal_suffix_forward(&bytes); 858 got == expected 859 } 860 861 fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool { 862 if bytes.is_empty() { 863 return true; 864 } 865 866 let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal); 867 let expected = naive_maximal_suffix_reverse(&bytes); 868 expected == got 869 } 870 } 871 } 872