1 use std::collections::BTreeMap;
2 use std::env;
3 use std::fmt::{self, Write};
4 use std::thread;
5 
6 use regex;
7 use regex_automata::{DenseDFA, ErrorKind, Regex, RegexBuilder, StateID, DFA};
8 use serde_bytes;
9 use toml;
10 
// Parses one test file and adds its tests to the collection `$col`.
//
// `$path` is spliced into `concat!`, so it must be a literal; it is appended
// to "../data/tests/". The file's bytes are embedded at compile time via
// `include_bytes!`, and the joined path string is also passed to
// `RegexTests::load`, which uses it to label its panic message.
macro_rules! load {
    ($col:ident, $path:expr) => {
        $col.extend(RegexTests::load(
            concat!("../data/tests/", $path),
            include_bytes!(concat!("../data/tests/", $path)),
        ));
    };
}
19 
// The full collection of regex tests from the embedded TOML files, keyed by
// test name.
//
// Initialization panics if a file cannot be deserialized (see
// `RegexTests::load`) or if two tests share a name (see
// `RegexTestCollection::extend`).
lazy_static! {
    pub static ref SUITE: RegexTestCollection = {
        let mut col = RegexTestCollection::new();
        load!(col, "fowler/basic.toml");
        load!(col, "fowler/nullsubexpr.toml");
        load!(col, "fowler/repetition.toml");
        load!(col, "fowler/repetition-long.toml");
        load!(col, "crazy.toml");
        load!(col, "flags.toml");
        load!(col, "iter.toml");
        load!(col, "no-unicode.toml");
        load!(col, "unicode.toml");
        col
    };
}
35 
/// A set of regex tests in which every test has a distinct name.
#[derive(Clone, Debug)]
pub struct RegexTestCollection {
    /// Tests keyed by their `name` field.
    pub by_name: BTreeMap<String, RegexTest>,
}
40 
/// The deserialized contents of a single test file: a list of tests under
/// the `tests` key.
#[derive(Clone, Debug, Deserialize)]
pub struct RegexTests {
    /// The tests, in the order they were deserialized.
    pub tests: Vec<RegexTest>,
}
45 
/// A single regex test case: a pattern, an input and the matches expected
/// when searching that input.
#[derive(Clone, Debug, Deserialize)]
pub struct RegexTest {
    /// Name of the test. It is the key in `RegexTestCollection::by_name` and
    /// the text that the whitelist/blacklist filters are matched against.
    pub name: String,
    /// Options applied when building the regex; empty when omitted.
    #[serde(default)]
    pub options: Vec<RegexTestOption>,
    /// The regex pattern handed to the builder.
    pub pattern: String,
    /// The bytes to search. When the `Escaped` option is set, this is
    /// replaced by its unescaped form at load time.
    #[serde(with = "serde_bytes")]
    pub input: Vec<u8>,
    /// The expected matches. The first entry (if any) is what a single
    /// `find` is expected to report; the whole list is what match iteration
    /// is expected to report.
    #[serde(rename = "matches")]
    pub matches: Vec<Match>,
    /// Expected capture group spans; empty when omitted.
    /// NOTE(review): not read by any code visible in this file.
    #[serde(default)]
    pub captures: Vec<Option<Match>>,
    /// `None` when omitted.
    /// NOTE(review): presumably the line of the originating entry in the
    /// Fowler test data — confirm against the data files.
    #[serde(default)]
    pub fowler_line_number: Option<u64>,
}
61 
/// A per-test option. Variants are deserialized from their kebab-case names
/// (e.g. `case-insensitive`).
#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
#[serde(rename_all = "kebab-case")]
pub enum RegexTestOption {
    /// Build the regex with `anchored(true)`.
    Anchored,
    /// Build the regex with `case_insensitive(true)`.
    CaseInsensitive,
    /// Build the regex with `unicode(false)`.
    NoUnicode,
    /// The test input contains escape sequences that are unescaped when the
    /// test is loaded. Has no effect on the regex builder.
    Escaped,
    /// Build the regex with `allow_invalid_utf8(true)`. Spelled
    /// `invalid-utf8` in test files.
    #[serde(rename = "invalid-utf8")]
    InvalidUTF8,
}
72 
/// The span of a single match, as a `(start, end)` pair of offsets into the
/// input. Built from the pairs returned by `Regex::find` and
/// `Regex::find_iter`.
#[derive(Clone, Copy, Deserialize, Eq, PartialEq)]
pub struct Match {
    /// Offset at which the match starts.
    pub start: usize,
    /// Offset at which the match ends.
    /// NOTE(review): presumably exclusive — confirm against regex-automata.
    pub end: usize,
}
78 
79 impl RegexTestCollection {
new() -> RegexTestCollection80     fn new() -> RegexTestCollection {
81         RegexTestCollection { by_name: BTreeMap::new() }
82     }
83 
extend(&mut self, tests: RegexTests)84     fn extend(&mut self, tests: RegexTests) {
85         for test in tests.tests {
86             let name = test.name.clone();
87             if self.by_name.contains_key(&name) {
88                 panic!("found duplicate test {}", name);
89             }
90             self.by_name.insert(name, test);
91         }
92     }
93 
tests(&self) -> Vec<&RegexTest>94     pub fn tests(&self) -> Vec<&RegexTest> {
95         self.by_name.values().collect()
96     }
97 }
98 
99 impl RegexTests {
load(path: &str, slice: &[u8]) -> RegexTests100     fn load(path: &str, slice: &[u8]) -> RegexTests {
101         let mut data: RegexTests = toml::from_slice(slice)
102             .expect(&format!("failed to load {}", path));
103         for test in &mut data.tests {
104             if test.options.contains(&RegexTestOption::Escaped) {
105                 test.input = unescape_bytes(&test.input);
106             }
107         }
108         data
109     }
110 }
111 
/// Runs regex tests, records their outcomes and reports failures when
/// `assert` is called.
///
/// Dropping a tester whose results have not been asserted panics (unless the
/// thread is already panicking).
#[derive(Debug)]
pub struct RegexTester {
    /// Whether `assert` has been called since the tester was created or
    /// since the most recent test check was run.
    asserted: bool,
    /// Outcomes of every check run so far.
    results: RegexTestResults,
    /// When set, tests whose name starts with `repetition-long` are skipped.
    skip_expensive: bool,
    /// If non-empty, only tests whose name matches at least one of these
    /// regexes are run.
    whitelist: Vec<regex::Regex>,
    /// Tests whose name matches any of these regexes are skipped.
    blacklist: Vec<regex::Regex>,
}
120 
121 impl Drop for RegexTester {
drop(&mut self)122     fn drop(&mut self) {
123         // If we haven't asserted yet, then the test is probably buggy, so
124         // fail it. But if we're already panicking (e.g., a bug in the regex
125         // engine), then don't double-panic, which causes an immediate abort.
126         if !thread::panicking() && !self.asserted {
127             panic!("must call RegexTester::assert at end of test");
128         }
129     }
130 }
131 
132 impl RegexTester {
new() -> RegexTester133     pub fn new() -> RegexTester {
134         let mut tester = RegexTester {
135             asserted: false,
136             results: RegexTestResults::default(),
137             skip_expensive: false,
138             whitelist: vec![],
139             blacklist: vec![],
140         };
141         for x in env::var("REGEX_TEST").unwrap_or("".to_string()).split(",") {
142             let x = x.trim();
143             if x.is_empty() {
144                 continue;
145             }
146             if x.starts_with("-") {
147                 tester = tester.blacklist(&x[1..]);
148             } else {
149                 tester = tester.whitelist(x);
150             }
151         }
152         tester
153     }
154 
skip_expensive(mut self) -> RegexTester155     pub fn skip_expensive(mut self) -> RegexTester {
156         self.skip_expensive = true;
157         self
158     }
159 
whitelist(mut self, name: &str) -> RegexTester160     pub fn whitelist(mut self, name: &str) -> RegexTester {
161         self.whitelist.push(regex::Regex::new(name).unwrap());
162         self
163     }
164 
blacklist(mut self, name: &str) -> RegexTester165     pub fn blacklist(mut self, name: &str) -> RegexTester {
166         self.blacklist.push(regex::Regex::new(name).unwrap());
167         self
168     }
169 
assert(&mut self)170     pub fn assert(&mut self) {
171         self.asserted = true;
172         self.results.assert();
173     }
174 
build_regex<S: StateID>( &self, mut builder: RegexBuilder, test: &RegexTest, ) -> Option<Regex<DenseDFA<Vec<S>, S>>>175     pub fn build_regex<S: StateID>(
176         &self,
177         mut builder: RegexBuilder,
178         test: &RegexTest,
179     ) -> Option<Regex<DenseDFA<Vec<S>, S>>> {
180         if self.skip(test) {
181             return None;
182         }
183         self.apply_options(test, &mut builder);
184 
185         match builder.build_with_size::<S>(&test.pattern) {
186             Ok(re) => Some(re),
187             Err(err) => {
188                 if let ErrorKind::Unsupported(_) = *err.kind() {
189                     None
190                 } else {
191                     panic!(
192                         "failed to build {:?} with pattern '{:?}': {}",
193                         test.name, test.pattern, err
194                     );
195                 }
196             }
197         }
198     }
199 
test_all<'a, I, T>(&mut self, builder: RegexBuilder, tests: I) where I: IntoIterator<IntoIter = T, Item = &'a RegexTest>, T: Iterator<Item = &'a RegexTest>,200     pub fn test_all<'a, I, T>(&mut self, builder: RegexBuilder, tests: I)
201     where
202         I: IntoIterator<IntoIter = T, Item = &'a RegexTest>,
203         T: Iterator<Item = &'a RegexTest>,
204     {
205         for test in tests {
206             let builder = builder.clone();
207             let re: Regex = match self.build_regex(builder, test) {
208                 None => continue,
209                 Some(re) => re,
210             };
211             self.test(test, &re);
212         }
213     }
214 
test<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>)215     pub fn test<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
216         self.test_is_match(test, re);
217         self.test_find(test, re);
218         // Some tests (namely, fowler) are designed only to detect the
219         // first match even if there are more subsequent matches. To that
220         // end, we only test match iteration when the number of matches
221         // expected is not 1, or if the test name has 'iter' in it.
222         if test.name.contains("iter") || test.matches.len() != 1 {
223             self.test_find_iter(test, re);
224         }
225     }
226 
test_is_match<'a, D: DFA>( &mut self, test: &RegexTest, re: &Regex<D>, )227     pub fn test_is_match<'a, D: DFA>(
228         &mut self,
229         test: &RegexTest,
230         re: &Regex<D>,
231     ) {
232         self.asserted = false;
233 
234         let got = re.is_match(&test.input);
235         let expected = test.matches.len() >= 1;
236         if got == expected {
237             self.results.succeeded.push(test.clone());
238             return;
239         }
240         self.results.failed.push(RegexTestFailure {
241             test: test.clone(),
242             kind: RegexTestFailureKind::IsMatch,
243         });
244     }
245 
test_find<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>)246     pub fn test_find<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
247         self.asserted = false;
248 
249         let got =
250             re.find(&test.input).map(|(start, end)| Match { start, end });
251         if got == test.matches.get(0).map(|&m| m) {
252             self.results.succeeded.push(test.clone());
253             return;
254         }
255         self.results.failed.push(RegexTestFailure {
256             test: test.clone(),
257             kind: RegexTestFailureKind::Find { got },
258         });
259     }
260 
test_find_iter<'a, D: DFA>( &mut self, test: &RegexTest, re: &Regex<D>, )261     pub fn test_find_iter<'a, D: DFA>(
262         &mut self,
263         test: &RegexTest,
264         re: &Regex<D>,
265     ) {
266         self.asserted = false;
267 
268         let got: Vec<Match> = re
269             .find_iter(&test.input)
270             .map(|(start, end)| Match { start, end })
271             .collect();
272         if got == test.matches {
273             self.results.succeeded.push(test.clone());
274             return;
275         }
276         self.results.failed.push(RegexTestFailure {
277             test: test.clone(),
278             kind: RegexTestFailureKind::FindIter { got },
279         });
280     }
281 
skip(&self, test: &RegexTest) -> bool282     fn skip(&self, test: &RegexTest) -> bool {
283         if self.skip_expensive {
284             if test.name.starts_with("repetition-long") {
285                 return true;
286             }
287         }
288         if !self.blacklist.is_empty() {
289             if self.blacklist.iter().any(|re| re.is_match(&test.name)) {
290                 return true;
291             }
292         }
293         if !self.whitelist.is_empty() {
294             if !self.whitelist.iter().any(|re| re.is_match(&test.name)) {
295                 return true;
296             }
297         }
298         false
299     }
300 
apply_options(&self, test: &RegexTest, builder: &mut RegexBuilder)301     fn apply_options(&self, test: &RegexTest, builder: &mut RegexBuilder) {
302         for opt in &test.options {
303             match *opt {
304                 RegexTestOption::Anchored => {
305                     builder.anchored(true);
306                 }
307                 RegexTestOption::CaseInsensitive => {
308                     builder.case_insensitive(true);
309                 }
310                 RegexTestOption::NoUnicode => {
311                     builder.unicode(false);
312                 }
313                 RegexTestOption::Escaped => {}
314                 RegexTestOption::InvalidUTF8 => {
315                     builder.allow_invalid_utf8(true);
316                 }
317             }
318         }
319     }
320 }
321 
/// The outcomes of all checks run by a `RegexTester`.
#[derive(Clone, Debug, Default)]
pub struct RegexTestResults {
    /// One clone of the test for every check that succeeded. A test can
    /// appear more than once, since each kind of check records separately.
    pub succeeded: Vec<RegexTest>,
    /// One entry for every check that failed, in the order they were
    /// recorded.
    pub failed: Vec<RegexTestFailure>,
}
329 
/// A failed check: the test it belongs to and what went wrong.
#[derive(Clone, Debug)]
pub struct RegexTestFailure {
    /// A copy of the test whose check failed.
    test: RegexTest,
    /// Which check failed, along with what the regex actually reported.
    kind: RegexTestFailureKind,
}
335 
/// The kind of check that failed.
#[derive(Clone, Debug)]
pub enum RegexTestFailureKind {
    /// `is_match` disagreed with whether the test expects a match.
    IsMatch,
    /// `find` reported `got` rather than the first expected match.
    Find { got: Option<Match> },
    /// Match iteration reported `got` rather than the expected matches.
    FindIter { got: Vec<Match> },
}
342 
343 impl RegexTestResults {
assert(&self)344     pub fn assert(&self) {
345         if self.failed.is_empty() {
346             return;
347         }
348         let failures = self
349             .failed
350             .iter()
351             .map(|f| f.to_string())
352             .collect::<Vec<String>>()
353             .join("\n\n");
354         panic!(
355             "found {} failures:\n{}\n{}\n{}\n\n\
356              Set the REGEX_TEST environment variable to filter tests, \n\
357              e.g., REGEX_TEST=crazy-misc,-crazy-misc2 runs every test \n\
358              whose name contains crazy-misc but not crazy-misc2\n\n",
359             self.failed.len(),
360             "~".repeat(79),
361             failures.trim(),
362             "~".repeat(79)
363         )
364     }
365 }
366 
367 impl fmt::Display for RegexTestFailure {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result368     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
369         write!(
370             f,
371             "{}: {}\n    \
372              options: {:?}\n    \
373              pattern: {}\n    \
374              pattern (escape): {}\n    \
375              input: {}\n    \
376              input (escape): {}\n    \
377              input (hex): {}",
378             self.test.name,
379             self.kind.fmt(&self.test)?,
380             self.test.options,
381             self.test.pattern,
382             escape_default(&self.test.pattern),
383             nice_raw_bytes(&self.test.input),
384             escape_bytes(&self.test.input),
385             hex_bytes(&self.test.input)
386         )
387     }
388 }
389 
390 impl RegexTestFailureKind {
fmt(&self, test: &RegexTest) -> Result<String, fmt::Error>391     fn fmt(&self, test: &RegexTest) -> Result<String, fmt::Error> {
392         let mut buf = String::new();
393         match *self {
394             RegexTestFailureKind::IsMatch => {
395                 if let Some(&m) = test.matches.get(0) {
396                     write!(buf, "expected match (at {}), but none found", m)?
397                 } else {
398                     write!(buf, "expected no match, but found a match")?
399                 }
400             }
401             RegexTestFailureKind::Find { got } => write!(
402                 buf,
403                 "expected {:?}, but found {:?}",
404                 test.matches.get(0),
405                 got
406             )?,
407             RegexTestFailureKind::FindIter { ref got } => write!(
408                 buf,
409                 "expected {:?}, but found {:?}",
410                 test.matches, got
411             )?,
412         }
413         Ok(buf)
414     }
415 }
416 
417 impl fmt::Display for Match {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result418     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
419         write!(f, "({}, {})", self.start, self.end)
420     }
421 }
422 
423 impl fmt::Debug for Match {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result424     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
425         write!(f, "({}, {})", self.start, self.end)
426     }
427 }
428 
nice_raw_bytes(bytes: &[u8]) -> String429 fn nice_raw_bytes(bytes: &[u8]) -> String {
430     use std::str;
431 
432     match str::from_utf8(bytes) {
433         Ok(s) => s.to_string(),
434         Err(_) => escape_bytes(bytes),
435     }
436 }
437 
/// Escapes every byte with `std::ascii::escape_default` and concatenates
/// the results into a string.
fn escape_bytes(bytes: &[u8]) -> String {
    use std::ascii;

    let mut out = String::with_capacity(bytes.len());
    for &byte in bytes {
        // Each escape sequence consists solely of ASCII bytes, so every
        // byte maps directly to a char.
        out.extend(ascii::escape_default(byte).map(char::from));
    }
    out
}
447 
/// Renders every byte as `\x` followed by two uppercase hex digits.
fn hex_bytes(bytes: &[u8]) -> String {
    // Every byte produces exactly four characters.
    let mut out = String::with_capacity(4 * bytes.len());
    for byte in bytes {
        out.push_str(&format!(r"\x{:02X}", byte));
    }
    out
}
451 
/// Escapes every character of `s` with `char::escape_default` and
/// concatenates the results.
fn escape_default(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        out.extend(ch.escape_default());
    }
    out
}
455 
unescape_bytes(bytes: &[u8]) -> Vec<u8>456 fn unescape_bytes(bytes: &[u8]) -> Vec<u8> {
457     use std::str;
458     use unescape::unescape;
459 
460     unescape(&str::from_utf8(bytes).expect("all input must be valid UTF-8"))
461 }
462