1 use std::iter::repeat;
2
3 mod iter;
4 mod memchr;
5
6 #[cfg(target_endian = "little")]
7 #[test]
byte_order()8 fn byte_order() {
9 eprintln!("LITTLE ENDIAN");
10 }
11
12 #[cfg(target_endian = "big")]
13 #[test]
byte_order()14 fn byte_order() {
15 eprintln!("BIG ENDIAN");
16 }
17
18 /// Create a sequence of tests that should be run by memchr implementations.
memchr_tests() -> Vec<MemchrTest>19 fn memchr_tests() -> Vec<MemchrTest> {
20 let mut tests = Vec::new();
21 for statict in MEMCHR_TESTS {
22 assert!(!statict.corpus.contains("%"), "% is not allowed in corpora");
23 assert!(!statict.corpus.contains("#"), "# is not allowed in corpora");
24 assert!(!statict.needles.contains(&b'%'), "% is an invalid needle");
25 assert!(!statict.needles.contains(&b'#'), "# is an invalid needle");
26
27 let t = MemchrTest {
28 corpus: statict.corpus.to_string(),
29 needles: statict.needles.to_vec(),
30 positions: statict.positions.to_vec(),
31 };
32 tests.push(t.clone());
33 tests.extend(t.expand());
34 }
35 tests
36 }
37
38 /// A set of tests for memchr-like functions.
39 ///
40 /// These tests mostly try to cover the short string cases. We cover the longer
41 /// string cases via the benchmarks (which are tests themselves), via
42 /// quickcheck tests and via automatic expansion of each test case (by
43 /// increasing the corpus size). Finally, we cover different alignment cases
44 /// in the tests by varying the starting point of the slice.
45 const MEMCHR_TESTS: &[MemchrTestStatic] = &[
46 // one needle (applied to memchr + memchr2 + memchr3)
47 MemchrTestStatic { corpus: "a", needles: &[b'a'], positions: &[0] },
48 MemchrTestStatic { corpus: "aa", needles: &[b'a'], positions: &[0, 1] },
49 MemchrTestStatic {
50 corpus: "aaa",
51 needles: &[b'a'],
52 positions: &[0, 1, 2],
53 },
54 MemchrTestStatic { corpus: "", needles: &[b'a'], positions: &[] },
55 MemchrTestStatic { corpus: "z", needles: &[b'a'], positions: &[] },
56 MemchrTestStatic { corpus: "zz", needles: &[b'a'], positions: &[] },
57 MemchrTestStatic { corpus: "zza", needles: &[b'a'], positions: &[2] },
58 MemchrTestStatic { corpus: "zaza", needles: &[b'a'], positions: &[1, 3] },
59 MemchrTestStatic { corpus: "zzza", needles: &[b'a'], positions: &[3] },
60 MemchrTestStatic { corpus: "\x00a", needles: &[b'a'], positions: &[1] },
61 MemchrTestStatic { corpus: "\x00", needles: &[b'\x00'], positions: &[0] },
62 MemchrTestStatic {
63 corpus: "\x00\x00",
64 needles: &[b'\x00'],
65 positions: &[0, 1],
66 },
67 MemchrTestStatic {
68 corpus: "\x00a\x00",
69 needles: &[b'\x00'],
70 positions: &[0, 2],
71 },
72 MemchrTestStatic {
73 corpus: "zzzzzzzzzzzzzzzza",
74 needles: &[b'a'],
75 positions: &[16],
76 },
77 MemchrTestStatic {
78 corpus: "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzza",
79 needles: &[b'a'],
80 positions: &[32],
81 },
82 // two needles (applied to memchr2 + memchr3)
83 MemchrTestStatic {
84 corpus: "az",
85 needles: &[b'a', b'z'],
86 positions: &[0, 1],
87 },
88 MemchrTestStatic {
89 corpus: "az",
90 needles: &[b'a', b'z'],
91 positions: &[0, 1],
92 },
93 MemchrTestStatic { corpus: "az", needles: &[b'x', b'y'], positions: &[] },
94 MemchrTestStatic { corpus: "az", needles: &[b'a', b'y'], positions: &[0] },
95 MemchrTestStatic { corpus: "az", needles: &[b'x', b'z'], positions: &[1] },
96 MemchrTestStatic {
97 corpus: "yyyyaz",
98 needles: &[b'a', b'z'],
99 positions: &[4, 5],
100 },
101 MemchrTestStatic {
102 corpus: "yyyyaz",
103 needles: &[b'z', b'a'],
104 positions: &[4, 5],
105 },
106 // three needles (applied to memchr3)
107 MemchrTestStatic {
108 corpus: "xyz",
109 needles: &[b'x', b'y', b'z'],
110 positions: &[0, 1, 2],
111 },
112 MemchrTestStatic {
113 corpus: "zxy",
114 needles: &[b'x', b'y', b'z'],
115 positions: &[0, 1, 2],
116 },
117 MemchrTestStatic {
118 corpus: "zxy",
119 needles: &[b'x', b'a', b'z'],
120 positions: &[0, 1],
121 },
122 MemchrTestStatic {
123 corpus: "zxy",
124 needles: &[b't', b'a', b'z'],
125 positions: &[0],
126 },
127 MemchrTestStatic {
128 corpus: "yxz",
129 needles: &[b't', b'a', b'z'],
130 positions: &[2],
131 },
132 ];
133
134 /// A description of a test on a memchr like function.
135 #[derive(Clone, Debug)]
136 struct MemchrTest {
137 /// The thing to search. We use `&str` instead of `&[u8]` because they
138 /// are nicer to write in tests, and we don't miss much since memchr
139 /// doesn't care about UTF-8.
140 ///
141 /// Corpora cannot contain either '%' or '#'. We use these bytes when
142 /// expanding test cases into many test cases, and we assume they are not
143 /// used. If they are used, `memchr_tests` will panic.
144 corpus: String,
145 /// The needles to search for. This is intended to be an "alternation" of
146 /// needles. The number of needles may cause this test to be skipped for
147 /// some memchr variants. For example, a test with 2 needles cannot be used
148 /// to test `memchr`, but can be used to test `memchr2` and `memchr3`.
149 /// However, a test with only 1 needle can be used to test all of `memchr`,
150 /// `memchr2` and `memchr3`. We achieve this by filling in the needles with
151 /// bytes that we never used in the corpus (such as '#').
152 needles: Vec<u8>,
153 /// The positions expected to match for all of the needles.
154 positions: Vec<usize>,
155 }
156
157 /// Like MemchrTest, but easier to define as a constant.
158 #[derive(Clone, Debug)]
159 struct MemchrTestStatic {
160 corpus: &'static str,
161 needles: &'static [u8],
162 positions: &'static [usize],
163 }
164
165 impl MemchrTest {
one<F: Fn(u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F)166 fn one<F: Fn(u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F) {
167 let needles = match self.needles(1) {
168 None => return,
169 Some(needles) => needles,
170 };
171 // We test different alignments here. Since some implementations use
172 // AVX2, which can read 32 bytes at a time, we test at least that.
173 // Moreover, with loop unrolling, we sometimes process 64 (sse2) or 128
174 // (avx) bytes at a time, so we include that in our offsets as well.
175 //
176 // You might think this would cause most needles to not be found, but
177 // we actually expand our tests to include corpus sizes all the way up
178 // to >500 bytes, so we should exericse most branches.
179 for align in 0..130 {
180 let corpus = self.corpus(align);
181 assert_eq!(
182 self.positions(align, reverse).get(0).cloned(),
183 f(needles[0], corpus.as_bytes()),
184 "search for {:?} failed in: {:?} (len: {}, alignment: {})",
185 needles[0] as char,
186 corpus,
187 corpus.len(),
188 align
189 );
190 }
191 }
192
two<F: Fn(u8, u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F)193 fn two<F: Fn(u8, u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F) {
194 let needles = match self.needles(2) {
195 None => return,
196 Some(needles) => needles,
197 };
198 for align in 0..130 {
199 let corpus = self.corpus(align);
200 assert_eq!(
201 self.positions(align, reverse).get(0).cloned(),
202 f(needles[0], needles[1], corpus.as_bytes()),
203 "search for {:?}|{:?} failed in: {:?} \
204 (len: {}, alignment: {})",
205 needles[0] as char,
206 needles[1] as char,
207 corpus,
208 corpus.len(),
209 align
210 );
211 }
212 }
213
three<F: Fn(u8, u8, u8, &[u8]) -> Option<usize>>( &self, reverse: bool, f: F, )214 fn three<F: Fn(u8, u8, u8, &[u8]) -> Option<usize>>(
215 &self,
216 reverse: bool,
217 f: F,
218 ) {
219 let needles = match self.needles(3) {
220 None => return,
221 Some(needles) => needles,
222 };
223 for align in 0..130 {
224 let corpus = self.corpus(align);
225 assert_eq!(
226 self.positions(align, reverse).get(0).cloned(),
227 f(needles[0], needles[1], needles[2], corpus.as_bytes()),
228 "search for {:?}|{:?}|{:?} failed in: {:?} \
229 (len: {}, alignment: {})",
230 needles[0] as char,
231 needles[1] as char,
232 needles[2] as char,
233 corpus,
234 corpus.len(),
235 align
236 );
237 }
238 }
239
iter_one<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, &'a [u8]) -> I, I: Iterator<Item = usize>,240 fn iter_one<'a, I, F>(&'a self, reverse: bool, f: F)
241 where
242 F: FnOnce(u8, &'a [u8]) -> I,
243 I: Iterator<Item = usize>,
244 {
245 if let Some(ns) = self.needles(1) {
246 self.iter(reverse, f(ns[0], self.corpus.as_bytes()));
247 }
248 }
249
iter_two<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, u8, &'a [u8]) -> I, I: Iterator<Item = usize>,250 fn iter_two<'a, I, F>(&'a self, reverse: bool, f: F)
251 where
252 F: FnOnce(u8, u8, &'a [u8]) -> I,
253 I: Iterator<Item = usize>,
254 {
255 if let Some(ns) = self.needles(2) {
256 self.iter(reverse, f(ns[0], ns[1], self.corpus.as_bytes()));
257 }
258 }
259
iter_three<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, u8, u8, &'a [u8]) -> I, I: Iterator<Item = usize>,260 fn iter_three<'a, I, F>(&'a self, reverse: bool, f: F)
261 where
262 F: FnOnce(u8, u8, u8, &'a [u8]) -> I,
263 I: Iterator<Item = usize>,
264 {
265 if let Some(ns) = self.needles(3) {
266 self.iter(reverse, f(ns[0], ns[1], ns[2], self.corpus.as_bytes()));
267 }
268 }
269
270 /// Test that the positions yielded by the given iterator match the
271 /// positions in this test. If reverse is true, then reverse the positions
272 /// before comparing them.
iter<I: Iterator<Item = usize>>(&self, reverse: bool, it: I)273 fn iter<I: Iterator<Item = usize>>(&self, reverse: bool, it: I) {
274 assert_eq!(
275 self.positions(0, reverse),
276 it.collect::<Vec<usize>>(),
277 r"search for {:?} failed in: {:?}",
278 self.needles.iter().map(|&b| b as char).collect::<Vec<char>>(),
279 self.corpus
280 );
281 }
282
283 /// Expand this test into many variations of the same test.
284 ///
285 /// In particular, this will generate more tests with larger corpus sizes.
286 /// The expected positions are updated to maintain the integrity of the
287 /// test.
288 ///
289 /// This is important in testing a memchr implementation, because there are
290 /// often different cases depending on the length of the corpus.
291 ///
292 /// Note that we extend the corpus by adding `%` bytes, which we
293 /// don't otherwise use as a needle.
expand(&self) -> Vec<MemchrTest>294 fn expand(&self) -> Vec<MemchrTest> {
295 let mut more = Vec::new();
296
297 // Add bytes to the start of the corpus.
298 for i in 1..515 {
299 let mut t = self.clone();
300 let mut new_corpus: String = repeat('%').take(i).collect();
301 new_corpus.push_str(&t.corpus);
302 t.corpus = new_corpus;
303 t.positions = t.positions.into_iter().map(|p| p + i).collect();
304 more.push(t);
305 }
306 // Add bytes to the end of the corpus.
307 for i in 1..515 {
308 let mut t = self.clone();
309 let padding: String = repeat('%').take(i).collect();
310 t.corpus.push_str(&padding);
311 more.push(t);
312 }
313
314 more
315 }
316
317 /// Return the corpus at the given alignment.
318 ///
319 /// If the alignment exceeds the length of the corpus, then this returns
320 /// an empty slice.
corpus(&self, align: usize) -> &str321 fn corpus(&self, align: usize) -> &str {
322 self.corpus.get(align..).unwrap_or("")
323 }
324
325 /// Return exactly `count` needles from this test. If this test has less
326 /// than `count` needles, then add `#` until the number of needles
327 /// matches `count`. If this test has more than `count` needles, then
328 /// return `None` (because there is no way to use this test data for a
329 /// search using fewer needles).
needles(&self, count: usize) -> Option<Vec<u8>>330 fn needles(&self, count: usize) -> Option<Vec<u8>> {
331 if self.needles.len() > count {
332 return None;
333 }
334
335 let mut needles = self.needles.to_vec();
336 for _ in needles.len()..count {
337 // we assume # is never used in tests.
338 needles.push(b'#');
339 }
340 Some(needles)
341 }
342
343 /// Return the positions in this test, reversed if `reverse` is true.
344 ///
345 /// If alignment is given, then all positions greater than or equal to that
346 /// alignment are offset by the alignment. Positions less than the
347 /// alignment are dropped.
positions(&self, align: usize, reverse: bool) -> Vec<usize>348 fn positions(&self, align: usize, reverse: bool) -> Vec<usize> {
349 let positions = if reverse {
350 let mut positions = self.positions.to_vec();
351 positions.reverse();
352 positions
353 } else {
354 self.positions.to_vec()
355 };
356 positions
357 .into_iter()
358 .filter(|&p| p >= align)
359 .map(|p| p - align)
360 .collect()
361 }
362 }
363