• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::iter::repeat;
2 
3 mod iter;
4 mod memchr;
5 
6 #[cfg(target_endian = "little")]
7 #[test]
byte_order()8 fn byte_order() {
9     eprintln!("LITTLE ENDIAN");
10 }
11 
12 #[cfg(target_endian = "big")]
13 #[test]
byte_order()14 fn byte_order() {
15     eprintln!("BIG ENDIAN");
16 }
17 
18 /// Create a sequence of tests that should be run by memchr implementations.
memchr_tests() -> Vec<MemchrTest>19 fn memchr_tests() -> Vec<MemchrTest> {
20     let mut tests = Vec::new();
21     for statict in MEMCHR_TESTS {
22         assert!(!statict.corpus.contains("%"), "% is not allowed in corpora");
23         assert!(!statict.corpus.contains("#"), "# is not allowed in corpora");
24         assert!(!statict.needles.contains(&b'%'), "% is an invalid needle");
25         assert!(!statict.needles.contains(&b'#'), "# is an invalid needle");
26 
27         let t = MemchrTest {
28             corpus: statict.corpus.to_string(),
29             needles: statict.needles.to_vec(),
30             positions: statict.positions.to_vec(),
31         };
32         tests.push(t.clone());
33         tests.extend(t.expand());
34     }
35     tests
36 }
37 
38 /// A set of tests for memchr-like functions.
39 ///
40 /// These tests mostly try to cover the short string cases. We cover the longer
41 /// string cases via the benchmarks (which are tests themselves), via
42 /// quickcheck tests and via automatic expansion of each test case (by
43 /// increasing the corpus size). Finally, we cover different alignment cases
44 /// in the tests by varying the starting point of the slice.
45 const MEMCHR_TESTS: &[MemchrTestStatic] = &[
46     // one needle (applied to memchr + memchr2 + memchr3)
47     MemchrTestStatic { corpus: "a", needles: &[b'a'], positions: &[0] },
48     MemchrTestStatic { corpus: "aa", needles: &[b'a'], positions: &[0, 1] },
49     MemchrTestStatic {
50         corpus: "aaa",
51         needles: &[b'a'],
52         positions: &[0, 1, 2],
53     },
54     MemchrTestStatic { corpus: "", needles: &[b'a'], positions: &[] },
55     MemchrTestStatic { corpus: "z", needles: &[b'a'], positions: &[] },
56     MemchrTestStatic { corpus: "zz", needles: &[b'a'], positions: &[] },
57     MemchrTestStatic { corpus: "zza", needles: &[b'a'], positions: &[2] },
58     MemchrTestStatic { corpus: "zaza", needles: &[b'a'], positions: &[1, 3] },
59     MemchrTestStatic { corpus: "zzza", needles: &[b'a'], positions: &[3] },
60     MemchrTestStatic { corpus: "\x00a", needles: &[b'a'], positions: &[1] },
61     MemchrTestStatic { corpus: "\x00", needles: &[b'\x00'], positions: &[0] },
62     MemchrTestStatic {
63         corpus: "\x00\x00",
64         needles: &[b'\x00'],
65         positions: &[0, 1],
66     },
67     MemchrTestStatic {
68         corpus: "\x00a\x00",
69         needles: &[b'\x00'],
70         positions: &[0, 2],
71     },
72     MemchrTestStatic {
73         corpus: "zzzzzzzzzzzzzzzza",
74         needles: &[b'a'],
75         positions: &[16],
76     },
77     MemchrTestStatic {
78         corpus: "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzza",
79         needles: &[b'a'],
80         positions: &[32],
81     },
82     // two needles (applied to memchr2 + memchr3)
83     MemchrTestStatic {
84         corpus: "az",
85         needles: &[b'a', b'z'],
86         positions: &[0, 1],
87     },
88     MemchrTestStatic {
89         corpus: "az",
90         needles: &[b'a', b'z'],
91         positions: &[0, 1],
92     },
93     MemchrTestStatic { corpus: "az", needles: &[b'x', b'y'], positions: &[] },
94     MemchrTestStatic { corpus: "az", needles: &[b'a', b'y'], positions: &[0] },
95     MemchrTestStatic { corpus: "az", needles: &[b'x', b'z'], positions: &[1] },
96     MemchrTestStatic {
97         corpus: "yyyyaz",
98         needles: &[b'a', b'z'],
99         positions: &[4, 5],
100     },
101     MemchrTestStatic {
102         corpus: "yyyyaz",
103         needles: &[b'z', b'a'],
104         positions: &[4, 5],
105     },
106     // three needles (applied to memchr3)
107     MemchrTestStatic {
108         corpus: "xyz",
109         needles: &[b'x', b'y', b'z'],
110         positions: &[0, 1, 2],
111     },
112     MemchrTestStatic {
113         corpus: "zxy",
114         needles: &[b'x', b'y', b'z'],
115         positions: &[0, 1, 2],
116     },
117     MemchrTestStatic {
118         corpus: "zxy",
119         needles: &[b'x', b'a', b'z'],
120         positions: &[0, 1],
121     },
122     MemchrTestStatic {
123         corpus: "zxy",
124         needles: &[b't', b'a', b'z'],
125         positions: &[0],
126     },
127     MemchrTestStatic {
128         corpus: "yxz",
129         needles: &[b't', b'a', b'z'],
130         positions: &[2],
131     },
132 ];
133 
134 /// A description of a test on a memchr like function.
135 #[derive(Clone, Debug)]
136 struct MemchrTest {
137     /// The thing to search. We use `&str` instead of `&[u8]` because they
138     /// are nicer to write in tests, and we don't miss much since memchr
139     /// doesn't care about UTF-8.
140     ///
141     /// Corpora cannot contain either '%' or '#'. We use these bytes when
142     /// expanding test cases into many test cases, and we assume they are not
143     /// used. If they are used, `memchr_tests` will panic.
144     corpus: String,
145     /// The needles to search for. This is intended to be an "alternation" of
146     /// needles. The number of needles may cause this test to be skipped for
147     /// some memchr variants. For example, a test with 2 needles cannot be used
148     /// to test `memchr`, but can be used to test `memchr2` and `memchr3`.
149     /// However, a test with only 1 needle can be used to test all of `memchr`,
150     /// `memchr2` and `memchr3`. We achieve this by filling in the needles with
151     /// bytes that we never used in the corpus (such as '#').
152     needles: Vec<u8>,
153     /// The positions expected to match for all of the needles.
154     positions: Vec<usize>,
155 }
156 
157 /// Like MemchrTest, but easier to define as a constant.
158 #[derive(Clone, Debug)]
159 struct MemchrTestStatic {
160     corpus: &'static str,
161     needles: &'static [u8],
162     positions: &'static [usize],
163 }
164 
165 impl MemchrTest {
one<F: Fn(u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F)166     fn one<F: Fn(u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F) {
167         let needles = match self.needles(1) {
168             None => return,
169             Some(needles) => needles,
170         };
171         // We test different alignments here. Since some implementations use
172         // AVX2, which can read 32 bytes at a time, we test at least that.
173         // Moreover, with loop unrolling, we sometimes process 64 (sse2) or 128
174         // (avx) bytes at a time, so we include that in our offsets as well.
175         //
176         // You might think this would cause most needles to not be found, but
177         // we actually expand our tests to include corpus sizes all the way up
178         // to >500 bytes, so we should exericse most branches.
179         for align in 0..130 {
180             let corpus = self.corpus(align);
181             assert_eq!(
182                 self.positions(align, reverse).get(0).cloned(),
183                 f(needles[0], corpus.as_bytes()),
184                 "search for {:?} failed in: {:?} (len: {}, alignment: {})",
185                 needles[0] as char,
186                 corpus,
187                 corpus.len(),
188                 align
189             );
190         }
191     }
192 
two<F: Fn(u8, u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F)193     fn two<F: Fn(u8, u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F) {
194         let needles = match self.needles(2) {
195             None => return,
196             Some(needles) => needles,
197         };
198         for align in 0..130 {
199             let corpus = self.corpus(align);
200             assert_eq!(
201                 self.positions(align, reverse).get(0).cloned(),
202                 f(needles[0], needles[1], corpus.as_bytes()),
203                 "search for {:?}|{:?} failed in: {:?} \
204                  (len: {}, alignment: {})",
205                 needles[0] as char,
206                 needles[1] as char,
207                 corpus,
208                 corpus.len(),
209                 align
210             );
211         }
212     }
213 
three<F: Fn(u8, u8, u8, &[u8]) -> Option<usize>>( &self, reverse: bool, f: F, )214     fn three<F: Fn(u8, u8, u8, &[u8]) -> Option<usize>>(
215         &self,
216         reverse: bool,
217         f: F,
218     ) {
219         let needles = match self.needles(3) {
220             None => return,
221             Some(needles) => needles,
222         };
223         for align in 0..130 {
224             let corpus = self.corpus(align);
225             assert_eq!(
226                 self.positions(align, reverse).get(0).cloned(),
227                 f(needles[0], needles[1], needles[2], corpus.as_bytes()),
228                 "search for {:?}|{:?}|{:?} failed in: {:?} \
229                  (len: {}, alignment: {})",
230                 needles[0] as char,
231                 needles[1] as char,
232                 needles[2] as char,
233                 corpus,
234                 corpus.len(),
235                 align
236             );
237         }
238     }
239 
iter_one<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, &'a [u8]) -> I, I: Iterator<Item = usize>,240     fn iter_one<'a, I, F>(&'a self, reverse: bool, f: F)
241     where
242         F: FnOnce(u8, &'a [u8]) -> I,
243         I: Iterator<Item = usize>,
244     {
245         if let Some(ns) = self.needles(1) {
246             self.iter(reverse, f(ns[0], self.corpus.as_bytes()));
247         }
248     }
249 
iter_two<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, u8, &'a [u8]) -> I, I: Iterator<Item = usize>,250     fn iter_two<'a, I, F>(&'a self, reverse: bool, f: F)
251     where
252         F: FnOnce(u8, u8, &'a [u8]) -> I,
253         I: Iterator<Item = usize>,
254     {
255         if let Some(ns) = self.needles(2) {
256             self.iter(reverse, f(ns[0], ns[1], self.corpus.as_bytes()));
257         }
258     }
259 
iter_three<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, u8, u8, &'a [u8]) -> I, I: Iterator<Item = usize>,260     fn iter_three<'a, I, F>(&'a self, reverse: bool, f: F)
261     where
262         F: FnOnce(u8, u8, u8, &'a [u8]) -> I,
263         I: Iterator<Item = usize>,
264     {
265         if let Some(ns) = self.needles(3) {
266             self.iter(reverse, f(ns[0], ns[1], ns[2], self.corpus.as_bytes()));
267         }
268     }
269 
270     /// Test that the positions yielded by the given iterator match the
271     /// positions in this test. If reverse is true, then reverse the positions
272     /// before comparing them.
iter<I: Iterator<Item = usize>>(&self, reverse: bool, it: I)273     fn iter<I: Iterator<Item = usize>>(&self, reverse: bool, it: I) {
274         assert_eq!(
275             self.positions(0, reverse),
276             it.collect::<Vec<usize>>(),
277             r"search for {:?} failed in: {:?}",
278             self.needles.iter().map(|&b| b as char).collect::<Vec<char>>(),
279             self.corpus
280         );
281     }
282 
283     /// Expand this test into many variations of the same test.
284     ///
285     /// In particular, this will generate more tests with larger corpus sizes.
286     /// The expected positions are updated to maintain the integrity of the
287     /// test.
288     ///
289     /// This is important in testing a memchr implementation, because there are
290     /// often different cases depending on the length of the corpus.
291     ///
292     /// Note that we extend the corpus by adding `%` bytes, which we
293     /// don't otherwise use as a needle.
expand(&self) -> Vec<MemchrTest>294     fn expand(&self) -> Vec<MemchrTest> {
295         let mut more = Vec::new();
296 
297         // Add bytes to the start of the corpus.
298         for i in 1..515 {
299             let mut t = self.clone();
300             let mut new_corpus: String = repeat('%').take(i).collect();
301             new_corpus.push_str(&t.corpus);
302             t.corpus = new_corpus;
303             t.positions = t.positions.into_iter().map(|p| p + i).collect();
304             more.push(t);
305         }
306         // Add bytes to the end of the corpus.
307         for i in 1..515 {
308             let mut t = self.clone();
309             let padding: String = repeat('%').take(i).collect();
310             t.corpus.push_str(&padding);
311             more.push(t);
312         }
313 
314         more
315     }
316 
317     /// Return the corpus at the given alignment.
318     ///
319     /// If the alignment exceeds the length of the corpus, then this returns
320     /// an empty slice.
corpus(&self, align: usize) -> &str321     fn corpus(&self, align: usize) -> &str {
322         self.corpus.get(align..).unwrap_or("")
323     }
324 
325     /// Return exactly `count` needles from this test. If this test has less
326     /// than `count` needles, then add `#` until the number of needles
327     /// matches `count`. If this test has more than `count` needles, then
328     /// return `None` (because there is no way to use this test data for a
329     /// search using fewer needles).
needles(&self, count: usize) -> Option<Vec<u8>>330     fn needles(&self, count: usize) -> Option<Vec<u8>> {
331         if self.needles.len() > count {
332             return None;
333         }
334 
335         let mut needles = self.needles.to_vec();
336         for _ in needles.len()..count {
337             // we assume # is never used in tests.
338             needles.push(b'#');
339         }
340         Some(needles)
341     }
342 
343     /// Return the positions in this test, reversed if `reverse` is true.
344     ///
345     /// If alignment is given, then all positions greater than or equal to that
346     /// alignment are offset by the alignment. Positions less than the
347     /// alignment are dropped.
positions(&self, align: usize, reverse: bool) -> Vec<usize>348     fn positions(&self, align: usize, reverse: bool) -> Vec<usize> {
349         let positions = if reverse {
350             let mut positions = self.positions.to_vec();
351             positions.reverse();
352             positions
353         } else {
354             self.positions.to_vec()
355         };
356         positions
357             .into_iter()
358             .filter(|&p| p >= align)
359             .map(|p| p - align)
360             .collect()
361     }
362 }
363