/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Chris Wilson <chris@chris-wilson.co.uk>
 *
 */

#include "igt.h"
#include "igt_device.h"
#include "igt_rand.h"
#include "igt_sysfs.h"

#include <unistd.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <inttypes.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/poll.h>
#include <sys/time.h>
#include <time.h>
#include "drm.h"

#define LOCAL_I915_EXEC_NO_RELOC (1<<11)
#define LOCAL_I915_EXEC_HANDLE_LUT (1<<12)

#define LOCAL_I915_EXEC_BSD_SHIFT      (13)
#define LOCAL_I915_EXEC_BSD_MASK       (3 << LOCAL_I915_EXEC_BSD_SHIFT)

#define ENGINE_FLAGS  (I915_EXEC_RING_MASK | LOCAL_I915_EXEC_BSD_MASK)

#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY

#define FORKED 1
#define CHAINED 2
#define CONTEXT 4

static double elapsed(const struct timespec *start, const struct timespec *end)
{
	return ((end->tv_sec - start->tv_sec) +
		(end->tv_nsec - start->tv_nsec)*1e-9);
}

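/*
 * Submit an empty batch to the given ring in bursts of 1024 execbufs
 * until the timeout expires. Returns the elapsed wall-clock time and
 * stores the number of submissions in *out, so callers can derive the
 * mean cost of a single nop execbuf.
 */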
static double nop_on_ring(int fd, uint32_t handle, unsigned ring_id,
			  int timeout, unsigned long *out)
{
	struct drm_i915_gem_execbuffer2 execbuf;
	struct drm_i915_gem_exec_object2 obj;
	struct timespec start, now;
	unsigned long count;

	memset(&obj, 0, sizeof(obj));
	obj.handle = handle;

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = to_user_pointer(&obj);
	execbuf.buffer_count = 1;
	execbuf.flags = ring_id;
	execbuf.flags |= LOCAL_I915_EXEC_HANDLE_LUT;
	execbuf.flags |= LOCAL_I915_EXEC_NO_RELOC;
	if (__gem_execbuf(fd, &execbuf)) {
		execbuf.flags = ring_id;
		gem_execbuf(fd, &execbuf);
	}
	intel_detect_and_clear_missed_interrupts(fd);

	count = 0;
	clock_gettime(CLOCK_MONOTONIC, &start);
	do {
		for (int loop = 0; loop < 1024; loop++)
			gem_execbuf(fd, &execbuf);

		count += 1024;
		clock_gettime(CLOCK_MONOTONIC, &now);
	} while (elapsed(&start, &now) < timeout);
	igt_assert_eq(intel_detect_and_clear_missed_interrupts(fd), 0);

	*out = count;
	return elapsed(&start, &now);
}

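/*
 * Ping-pong between two spinning batches within one buffer: each batch
 * stores its identity to a status dword and then loops on
 * MI_ARB_CHK/MI_BATCH_BUFFER_START until the CPU rewrites the loop into
 * MI_BATCH_BUFFER_END, releasing the engine for the next batch. The
 * engine never idles, so we measure the raw submit-to-execute
 * turnaround of the ring.
 */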
static void poll_ring(int fd, unsigned engine, const char *name, int timeout)
{
	const int gen = intel_gen(intel_get_drm_devid(fd));
	const uint32_t MI_ARB_CHK = 0x5 << 23;
	struct drm_i915_gem_execbuffer2 execbuf;
	struct drm_i915_gem_exec_object2 obj;
	struct drm_i915_gem_relocation_entry reloc[4], *r;
	uint32_t *bbe[2], *state, *batch;
	struct timespec tv = {};
	unsigned long cycles;
	unsigned flags;
	uint64_t elapsed;

	flags = I915_EXEC_NO_RELOC;
	if (gen == 4 || gen == 5)
		flags |= I915_EXEC_SECURE;

	gem_require_ring(fd, engine);
	igt_require(gem_can_store_dword(fd, engine));

	memset(&obj, 0, sizeof(obj));
	obj.handle = gem_create(fd, 4096);
	obj.relocs_ptr = to_user_pointer(reloc);
	obj.relocation_count = ARRAY_SIZE(reloc);

	r = memset(reloc, 0, sizeof(reloc));
	batch = gem_mmap__wc(fd, obj.handle, 0, 4096, PROT_WRITE);

	for (unsigned int start_offset = 0;
	     start_offset <= 128;
	     start_offset += 128) {
		uint32_t *b = batch + start_offset / sizeof(*batch);

		r->target_handle = obj.handle;
		r->offset = (b - batch + 1) * sizeof(uint32_t);
		r->delta = 4092;
		r->read_domains = I915_GEM_DOMAIN_RENDER;

		*b = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
		if (gen >= 8) {
			*++b = r->delta;
			*++b = 0;
		} else if (gen >= 4) {
			r->offset += sizeof(uint32_t);
			*++b = 0;
			*++b = r->delta;
		} else {
			*b -= 1;
			*++b = r->delta;
		}
		*++b = start_offset != 0;
		r++;

		b = batch + (start_offset + 64) / sizeof(*batch);
		bbe[start_offset != 0] = b;
		*b++ = MI_ARB_CHK;

		r->target_handle = obj.handle;
		r->offset = (b - batch + 1) * sizeof(uint32_t);
		r->read_domains = I915_GEM_DOMAIN_COMMAND;
		r->delta = start_offset + 64;
		if (gen >= 8) {
			*b++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
			*b++ = r->delta;
			*b++ = 0;
		} else if (gen >= 6) {
			*b++ = MI_BATCH_BUFFER_START | 1 << 8;
			*b++ = r->delta;
		} else {
			*b++ = MI_BATCH_BUFFER_START | 2 << 6;
			if (gen < 4)
				r->delta |= 1;
			*b++ = r->delta;
		}
		r++;
	}
	igt_assert(r == reloc + ARRAY_SIZE(reloc));
	state = batch + 1023;

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = to_user_pointer(&obj);
	execbuf.buffer_count = 1;
	execbuf.flags = engine | flags;

	cycles = 0;
	do {
		unsigned int idx = ++cycles & 1;

		*bbe[idx] = MI_ARB_CHK;
		execbuf.batch_start_offset =
			(bbe[idx] - batch) * sizeof(*batch) - 64;

		gem_execbuf(fd, &execbuf);

		*bbe[!idx] = MI_BATCH_BUFFER_END;
		__sync_synchronize();

		while (READ_ONCE(*state) != idx)
			;
	} while ((elapsed = igt_nsec_elapsed(&tv)) >> 30 < timeout);
	*bbe[cycles & 1] = MI_BATCH_BUFFER_END;
	gem_sync(fd, obj.handle);

	igt_info("%s completed %ld cycles: %.3f us\n",
		 name, cycles, elapsed*1e-3/cycles);

	munmap(batch, 4096);
	gem_close(fd, obj.handle);
}

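/*
 * Same ping-pong as poll_ring(), but each cycle is submitted to the
 * next capable engine in turn, with the status dword written to a
 * second buffer (snooped where set-caching is supported) to keep the
 * CPU readback cheap.
 */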
static void poll_sequential(int fd, const char *name, int timeout)
{
	const int gen = intel_gen(intel_get_drm_devid(fd));
	const uint32_t MI_ARB_CHK = 0x5 << 23;
	struct drm_i915_gem_execbuffer2 execbuf;
	struct drm_i915_gem_exec_object2 obj[2];
	struct drm_i915_gem_relocation_entry reloc[4], *r;
	uint32_t *bbe[2], *state, *batch;
	unsigned engines[16], nengine, engine, flags;
	struct timespec tv = {};
	unsigned long cycles;
	uint64_t elapsed;
	bool cached;

	flags = I915_EXEC_NO_RELOC;
	if (gen == 4 || gen == 5)
		flags |= I915_EXEC_SECURE;

	nengine = 0;
	for_each_physical_engine(fd, engine) {
		if (!gem_can_store_dword(fd, engine))
			continue;

		engines[nengine++] = engine;
	}
	igt_require(nengine);

	memset(obj, 0, sizeof(obj));
	obj[0].handle = gem_create(fd, 4096);
	obj[0].flags = EXEC_OBJECT_WRITE;
	cached = __gem_set_caching(fd, obj[0].handle, 1) == 0;
	obj[1].handle = gem_create(fd, 4096);
	obj[1].relocs_ptr = to_user_pointer(reloc);
	obj[1].relocation_count = ARRAY_SIZE(reloc);

	r = memset(reloc, 0, sizeof(reloc));
	batch = gem_mmap__wc(fd, obj[1].handle, 0, 4096, PROT_WRITE);

	for (unsigned int start_offset = 0;
	     start_offset <= 128;
	     start_offset += 128) {
		uint32_t *b = batch + start_offset / sizeof(*batch);

		r->target_handle = obj[0].handle;
		r->offset = (b - batch + 1) * sizeof(uint32_t);
		r->delta = 0;
		r->read_domains = I915_GEM_DOMAIN_RENDER;
		r->write_domain = I915_GEM_DOMAIN_RENDER;

		*b = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
		if (gen >= 8) {
			*++b = r->delta;
			*++b = 0;
		} else if (gen >= 4) {
			r->offset += sizeof(uint32_t);
			*++b = 0;
			*++b = r->delta;
		} else {
			*b -= 1;
			*++b = r->delta;
		}
		*++b = start_offset != 0;
		r++;

		b = batch + (start_offset + 64) / sizeof(*batch);
		bbe[start_offset != 0] = b;
		*b++ = MI_ARB_CHK;

		r->target_handle = obj[1].handle;
		r->offset = (b - batch + 1) * sizeof(uint32_t);
		r->read_domains = I915_GEM_DOMAIN_COMMAND;
		r->delta = start_offset + 64;
		if (gen >= 8) {
			*b++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
			*b++ = r->delta;
			*b++ = 0;
		} else if (gen >= 6) {
			*b++ = MI_BATCH_BUFFER_START | 1 << 8;
			*b++ = r->delta;
		} else {
			*b++ = MI_BATCH_BUFFER_START | 2 << 6;
			if (gen < 4)
				r->delta |= 1;
			*b++ = r->delta;
		}
		r++;
	}
	igt_assert(r == reloc + ARRAY_SIZE(reloc));

	if (cached)
		state = gem_mmap__cpu(fd, obj[0].handle, 0, 4096, PROT_READ);
	else
		state = gem_mmap__wc(fd, obj[0].handle, 0, 4096, PROT_READ);

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = to_user_pointer(obj);
	execbuf.buffer_count = ARRAY_SIZE(obj);

	cycles = 0;
	do {
		unsigned int idx = ++cycles & 1;

		*bbe[idx] = MI_ARB_CHK;
		execbuf.batch_start_offset =
			(bbe[idx] - batch) * sizeof(*batch) - 64;

		execbuf.flags = engines[cycles % nengine] | flags;
		gem_execbuf(fd, &execbuf);

		*bbe[!idx] = MI_BATCH_BUFFER_END;
		__sync_synchronize();

		while (READ_ONCE(*state) != idx)
			;
	} while ((elapsed = igt_nsec_elapsed(&tv)) >> 30 < timeout);
	*bbe[cycles & 1] = MI_BATCH_BUFFER_END;
	gem_sync(fd, obj[1].handle);

	igt_info("%s completed %ld cycles: %.3f us\n",
		 name, cycles, elapsed*1e-3/cycles);

	munmap(state, 4096);
	munmap(batch, 4096);
	gem_close(fd, obj[1].handle);
	gem_close(fd, obj[0].handle);
}

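/*
 * Report the sustained nop throughput of a single engine over a 20s run.
 */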
static void single(int fd, uint32_t handle,
		   unsigned ring_id, const char *ring_name)
{
	double time;
	unsigned long count;

	gem_require_ring(fd, ring_id);

	time = nop_on_ring(fd, handle, ring_id, 20, &count);
	igt_info("%s: %'lu cycles: %.3fus\n",
		 ring_name, count, time*1e6 / count);
}

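/*
 * Run nop_on_ring() several times and take the median per-nop latency,
 * smoothing over run-to-run noise before measurements are compared.
 */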
static double
stable_nop_on_ring(int fd, uint32_t handle, unsigned int engine,
		   int timeout, int reps)
{
	igt_stats_t s;
	double n;

	igt_assert(reps >= 5);

	igt_stats_init_with_size(&s, reps);
	s.is_float = true;

	while (reps--) {
		unsigned long count;
		double time;

		time = nop_on_ring(fd, handle, engine, timeout, &count);
		igt_stats_push_float(&s, time / count);
	}

	n = igt_stats_get_median(&s);
	igt_stats_fini(&s);

	return n;
}

#define assert_within_epsilon(x, ref, tolerance) \
        igt_assert_f((x) <= (1.0 + tolerance) * ref && \
                     (x) >= (1.0 - tolerance) * ref, \
                     "'%s' != '%s' (%f not within %f%% tolerance of %f)\n",\
                     #x, #ref, x, tolerance * 100.0, ref)

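/*
 * Check that disabling the displays does not change nop throughput:
 * measure with at least one connector lit, then again with every CRTC
 * forced off, and require the two medians to agree within 10%.
 */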
static void headless(int fd, uint32_t handle)
{
	unsigned int nr_connected = 0;
	drmModeConnector *connector;
	drmModeRes *res;
	double n_display, n_headless;

	res = drmModeGetResources(fd);
	igt_require(res);

	/* require at least one connected connector for the test */
	for (int i = 0; i < res->count_connectors; i++) {
		connector = drmModeGetConnectorCurrent(fd, res->connectors[i]);
		if (connector->connection == DRM_MODE_CONNECTED)
			nr_connected++;
		drmModeFreeConnector(connector);
	}
	igt_require(nr_connected > 0);

	/* set graphics mode to prevent blanking */
	kmstest_set_vt_graphics_mode();

	/* benchmark nops */
	n_display = stable_nop_on_ring(fd, handle, I915_EXEC_DEFAULT, 1, 5);
	igt_info("With one display connected: %.2fus\n",
		 n_display * 1e6);

	/* force all connectors off */
	kmstest_unset_all_crtcs(fd, res);

	/* benchmark nops again */
	n_headless = stable_nop_on_ring(fd, handle, I915_EXEC_DEFAULT, 1, 5);
	igt_info("Without a display connected (headless): %.2fus\n",
		 n_headless * 1e6);

	/* check that the two execution speeds are roughly the same */
	assert_within_epsilon(n_headless, n_display, 0.1f);
}

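/*
 * Fork one child per engine and hammer all engines simultaneously,
 * reporting the per-engine throughput against the individual baselines
 * gathered beforehand.
 */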
static void parallel(int fd, uint32_t handle, int timeout)
{
	struct drm_i915_gem_execbuffer2 execbuf;
	struct drm_i915_gem_exec_object2 obj;
	unsigned engines[16];
	const char *names[16];
	unsigned nengine;
	unsigned engine;
	unsigned long count;
	double time, sum;

	sum = 0;
	nengine = 0;
	for_each_physical_engine(fd, engine) {
		engines[nengine] = engine;
		names[nengine] = e__->name;
		nengine++;

		time = nop_on_ring(fd, handle, engine, 1, &count) / count;
		sum += time;
		igt_debug("%s: %.3fus\n", e__->name, 1e6*time);
	}
	igt_require(nengine);
	igt_info("average (individually): %.3fus\n", sum/nengine*1e6);

	memset(&obj, 0, sizeof(obj));
	obj.handle = handle;

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = to_user_pointer(&obj);
	execbuf.buffer_count = 1;
	execbuf.flags |= LOCAL_I915_EXEC_HANDLE_LUT;
	execbuf.flags |= LOCAL_I915_EXEC_NO_RELOC;
	if (__gem_execbuf(fd, &execbuf)) {
		execbuf.flags = 0;
		gem_execbuf(fd, &execbuf);
	}
	intel_detect_and_clear_missed_interrupts(fd);

	igt_fork(child, nengine) {
		struct timespec start, now;

		execbuf.flags &= ~ENGINE_FLAGS;
		execbuf.flags |= engines[child];

		count = 0;
		clock_gettime(CLOCK_MONOTONIC, &start);
		do {
			for (int loop = 0; loop < 1024; loop++)
				gem_execbuf(fd, &execbuf);
			count += 1024;
			clock_gettime(CLOCK_MONOTONIC, &now);
		} while (elapsed(&start, &now) < timeout);
		time = elapsed(&start, &now) / count;
		igt_info("%s: %ld cycles, %.3fus\n", names[child], count, 1e6*time);
	}

	igt_waitchildren();
	igt_assert_eq(intel_detect_and_clear_missed_interrupts(fd), 0);
}

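/*
 * Submit nops to every engine in turn from a single thread, expecting
 * the per-cycle average to approach that of the slowest engine running
 * in parallel with the rest (see the comment before the final assert).
 */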
static void series(int fd, uint32_t handle, int timeout)
{
	struct drm_i915_gem_execbuffer2 execbuf;
	struct drm_i915_gem_exec_object2 obj;
	struct timespec start, now, sync;
	unsigned engines[16];
	unsigned nengine;
	unsigned engine;
	unsigned long count;
	double time, max = 0, min = HUGE_VAL, sum = 0;
	const char *name;

	nengine = 0;
	for_each_physical_engine(fd, engine) {
		time = nop_on_ring(fd, handle, engine, 1, &count) / count;
		if (time > max) {
			name = e__->name;
			max = time;
		}
		if (time < min)
			min = time;
		sum += time;
		engines[nengine++] = engine;
	}
	igt_require(nengine);
	igt_info("Maximum execution latency on %s, %.3fus, min %.3fus, total %.3fus per cycle, average %.3fus\n",
		 name, max*1e6, min*1e6, sum*1e6, sum/nengine*1e6);

	memset(&obj, 0, sizeof(obj));
	obj.handle = handle;

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = to_user_pointer(&obj);
	execbuf.buffer_count = 1;
	execbuf.flags |= LOCAL_I915_EXEC_HANDLE_LUT;
	execbuf.flags |= LOCAL_I915_EXEC_NO_RELOC;
	if (__gem_execbuf(fd, &execbuf)) {
		execbuf.flags = 0;
		gem_execbuf(fd, &execbuf);
	}
	intel_detect_and_clear_missed_interrupts(fd);

	count = 0;
	clock_gettime(CLOCK_MONOTONIC, &start);
	do {
		for (int loop = 0; loop < 1024; loop++) {
			for (int n = 0; n < nengine; n++) {
				execbuf.flags &= ~ENGINE_FLAGS;
				execbuf.flags |= engines[n];
				gem_execbuf(fd, &execbuf);
			}
		}
		count += nengine * 1024;
		clock_gettime(CLOCK_MONOTONIC, &now);
	} while (elapsed(&start, &now) < timeout); /* Hang detection ~120s */
	gem_sync(fd, handle);
	clock_gettime(CLOCK_MONOTONIC, &sync);
	igt_debug("sync time: %.3fus\n", elapsed(&now, &sync)*1e6);
	igt_assert_eq(intel_detect_and_clear_missed_interrupts(fd), 0);

	time = elapsed(&start, &now) / count;
	igt_info("All (%d engines): %'lu cycles, average %.3fus per cycle [expected %.3fus]\n",
		 nengine, count, 1e6*time, 1e6*((max-min)/nengine+min));

	/* The rate limiting step should be how fast the slowest engine can
	 * execute its queue of requests, as when we wait upon a full ring all
	 * dispatch is frozen. So in general we cannot go faster than the
	 * slowest engine (but as all engines are in lockstep, they should all
	 * be executing in parallel and so the average should be max/nengines),
	 * but we should equally not go any slower.
	 *
	 * However, that depends upon being able to submit fast enough, and
	 * that in turn depends upon debugging turned off and no bottlenecks
	 * within the driver. We cannot assert that we hit ideal conditions
	 * across all engines, so we only look for an outrageous error
	 * condition.
	 */
	igt_assert_f(time < 2*sum,
		     "Average time (%.3fus) exceeds expectation for parallel execution (min %.3fus, max %.3fus; limit set at %.3fus)\n",
		     1e6*time, 1e6*min, 1e6*max, 1e6*sum*2);
}

static void xchg(void *array, unsigned i, unsigned j)
{
	unsigned *u = array;
	unsigned tmp = u[i];
	u[i] = u[j];
	u[j] = tmp;
}

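/*
 * Chain nops across engines by marking a shared buffer as written by
 * every submission, so each request depends on the previous one and the
 * engines execute strictly in sequence. FORKED runs one child per CPU,
 * CHAINED submits 1024 batches per engine before moving on, and CONTEXT
 * gives each child its own context.
 */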
static void sequential(int fd, uint32_t handle, unsigned flags, int timeout)
{
	const int ncpus = flags & FORKED ? sysconf(_SC_NPROCESSORS_ONLN) : 1;
	struct drm_i915_gem_execbuffer2 execbuf;
	struct drm_i915_gem_exec_object2 obj[2];
	unsigned engines[16];
	unsigned nengine;
	double *results;
	double time, sum;
	unsigned n;

	gem_require_contexts(fd);

	results = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
	igt_assert(results != MAP_FAILED);

	nengine = 0;
	sum = 0;
	for_each_physical_engine(fd, n) {
		unsigned long count;

		time = nop_on_ring(fd, handle, n, 1, &count) / count;
		sum += time;
		igt_debug("%s: %.3fus\n", e__->name, 1e6*time);

		engines[nengine++] = n;
	}
	igt_require(nengine);
	igt_info("Total (individual) execution latency %.3fus per cycle\n",
		 1e6*sum);

	memset(obj, 0, sizeof(obj));
	obj[0].handle = gem_create(fd, 4096);
	obj[0].flags = EXEC_OBJECT_WRITE;
	obj[1].handle = handle;

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = to_user_pointer(obj);
	execbuf.buffer_count = 2;
	execbuf.flags |= LOCAL_I915_EXEC_HANDLE_LUT;
	execbuf.flags |= LOCAL_I915_EXEC_NO_RELOC;
	igt_require(__gem_execbuf(fd, &execbuf) == 0);

	if (flags & CONTEXT) {
		uint32_t id;

		igt_require(__gem_context_create(fd, &id) == 0);
		execbuf.rsvd1 = id;
	}

	for (n = 0; n < nengine; n++) {
		execbuf.flags &= ~ENGINE_FLAGS;
		execbuf.flags |= engines[n];
		igt_require(__gem_execbuf(fd, &execbuf) == 0);
	}

	intel_detect_and_clear_missed_interrupts(fd);

	igt_fork(child, ncpus) {
		struct timespec start, now;
		unsigned long count;

		obj[0].handle = gem_create(fd, 4096);
		gem_execbuf(fd, &execbuf);

		if (flags & CONTEXT)
			execbuf.rsvd1 = gem_context_create(fd);

		hars_petruska_f54_1_random_perturb(child);

		count = 0;
		clock_gettime(CLOCK_MONOTONIC, &start);
		do {
			igt_permute_array(engines, nengine, xchg);
			if (flags & CHAINED) {
				for (n = 0; n < nengine; n++) {
					execbuf.flags &= ~ENGINE_FLAGS;
					execbuf.flags |= engines[n];
					for (int loop = 0; loop < 1024; loop++)
						gem_execbuf(fd, &execbuf);
				}
			} else {
				for (int loop = 0; loop < 1024; loop++) {
					for (n = 0; n < nengine; n++) {
						execbuf.flags &= ~ENGINE_FLAGS;
						execbuf.flags |= engines[n];
						gem_execbuf(fd, &execbuf);
					}
				}
			}
			count += 1024;
			clock_gettime(CLOCK_MONOTONIC, &now);
		} while (elapsed(&start, &now) < timeout); /* Hang detection ~120s */

		gem_sync(fd, obj[0].handle);
		clock_gettime(CLOCK_MONOTONIC, &now);
		results[child] = elapsed(&start, &now) / count;

		if (flags & CONTEXT)
			gem_context_destroy(fd, execbuf.rsvd1);

		gem_close(fd, obj[0].handle);
	}
	igt_waitchildren();
	igt_assert_eq(intel_detect_and_clear_missed_interrupts(fd), 0);

	results[ncpus] = 0;
	for (n = 0; n < ncpus; n++)
		results[ncpus] += results[n];
	results[ncpus] /= ncpus;

	igt_info("Sequential (%d engines, %d processes): average %.3fus per cycle [expected %.3fus]\n",
		 nengine, ncpus, 1e6*results[ncpus], 1e6*sum*ncpus);

	if (flags & CONTEXT)
		gem_context_destroy(fd, execbuf.rsvd1);

	gem_close(fd, obj[0].handle);
	munmap(results, 4096);
}

#define LOCAL_EXEC_FENCE_OUT (1 << 17)
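/*
 * sync_file helpers: a zero-timeout poll() arms signaling on the fence
 * (returning true if it had not yet signaled), while an infinite poll()
 * blocks until the fence fires.
 */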
static bool fence_enable_signaling(int fence)
{
	return poll(&(struct pollfd){fence, POLLIN}, 1, 0) == 0;
}

static bool fence_wait(int fence)
{
	return poll(&(struct pollfd){fence, POLLIN}, 1, -1) == 1;
}

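/*
 * Measure nop throughput when every request carries an output fence:
 * keep up to NFENCES in flight, arm each for signaling with a poll(),
 * and wait on the oldest before reusing its slot.
 */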
static void fence_signal(int fd, uint32_t handle,
			 unsigned ring_id, const char *ring_name,
			 int timeout)
{
#define NFENCES 512
	struct drm_i915_gem_execbuffer2 execbuf;
	struct drm_i915_gem_exec_object2 obj;
	struct timespec start, now;
	unsigned engines[16];
	unsigned nengine;
	int *fences, n;
	unsigned long count, signal;

	igt_require(gem_has_exec_fence(fd));

	nengine = 0;
	if (ring_id == ALL_ENGINES) {
		for_each_physical_engine(fd, n)
			engines[nengine++] = n;
	} else {
		gem_require_ring(fd, ring_id);
		engines[nengine++] = ring_id;
	}
	igt_require(nengine);

	fences = malloc(sizeof(*fences) * NFENCES);
	igt_assert(fences);
	memset(fences, -1, sizeof(*fences) * NFENCES);

	memset(&obj, 0, sizeof(obj));
	obj.handle = handle;

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = to_user_pointer(&obj);
	execbuf.buffer_count = 1;
	execbuf.flags = LOCAL_EXEC_FENCE_OUT;

	n = 0;
	count = 0;
	signal = 0;

	intel_detect_and_clear_missed_interrupts(fd);
	clock_gettime(CLOCK_MONOTONIC, &start);
	do {
		for (int loop = 0; loop < 1024; loop++) {
			for (int e = 0; e < nengine; e++) {
				if (fences[n] != -1) {
					igt_assert(fence_wait(fences[n]));
					close(fences[n]);
				}

				execbuf.flags &= ~ENGINE_FLAGS;
				execbuf.flags |= engines[e];
				gem_execbuf_wr(fd, &execbuf);

				/* Enable signaling by doing a poll() */
				fences[n] = execbuf.rsvd2 >> 32;
				signal += fence_enable_signaling(fences[n]);

				n = (n + 1) % NFENCES;
			}
		}

		count += 1024 * nengine;
		clock_gettime(CLOCK_MONOTONIC, &now);
	} while (elapsed(&start, &now) < timeout);
	igt_assert_eq(intel_detect_and_clear_missed_interrupts(fd), 0);

	for (n = 0; n < NFENCES; n++)
		if (fences[n] != -1)
			close(fences[n]);
	free(fences);

	igt_info("Signal %s: %'lu cycles (%'lu signals): %.3fus\n",
		 ring_name, count, signal, elapsed(&start, &now) * 1e6 / count);
}

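/*
 * Measure nop latency for a maximum-priority context while a
 * minimum-priority spinner occupies the engine, so every burst must
 * preempt the spinner before it can execute.
 */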
static void preempt(int fd, uint32_t handle,
		   unsigned ring_id, const char *ring_name)
{
	struct drm_i915_gem_execbuffer2 execbuf;
	struct drm_i915_gem_exec_object2 obj;
	struct timespec start, now;
	unsigned long count;
	uint32_t ctx[2];

	gem_require_ring(fd, ring_id);

	ctx[0] = gem_context_create(fd);
	gem_context_set_priority(fd, ctx[0], MIN_PRIO);

	ctx[1] = gem_context_create(fd);
	gem_context_set_priority(fd, ctx[1], MAX_PRIO);

	memset(&obj, 0, sizeof(obj));
	obj.handle = handle;

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = to_user_pointer(&obj);
	execbuf.buffer_count = 1;
	execbuf.flags = ring_id;
	execbuf.flags |= LOCAL_I915_EXEC_HANDLE_LUT;
	execbuf.flags |= LOCAL_I915_EXEC_NO_RELOC;
	if (__gem_execbuf(fd, &execbuf)) {
		execbuf.flags = ring_id;
		gem_execbuf(fd, &execbuf);
	}
	execbuf.rsvd1 = ctx[1];
	intel_detect_and_clear_missed_interrupts(fd);

	count = 0;
	clock_gettime(CLOCK_MONOTONIC, &start);
	do {
		igt_spin_t *spin =
			__igt_spin_new(fd,
				       .ctx = ctx[0],
				       .engine = ring_id);

		for (int loop = 0; loop < 1024; loop++)
			gem_execbuf(fd, &execbuf);

		igt_spin_free(fd, spin);

		count += 1024;
		clock_gettime(CLOCK_MONOTONIC, &now);
	} while (elapsed(&start, &now) < 20);
	igt_assert_eq(intel_detect_and_clear_missed_interrupts(fd), 0);

	gem_context_destroy(fd, ctx[1]);
	gem_context_destroy(fd, ctx[0]);

	igt_info("%s: %'lu cycles: %.3fus\n",
		 ring_name, count, elapsed(&start, &now)*1e6 / count);
}

igt_main
{
	const struct intel_execution_engine *e;
	uint32_t handle = 0;
	int device = -1;

	igt_fixture {
		const uint32_t bbe = MI_BATCH_BUFFER_END;

		device = drm_open_driver(DRIVER_INTEL);
		igt_require_gem(device);
		gem_submission_print_method(device);
		gem_scheduler_print_capability(device);

		handle = gem_create(device, 4096);
		gem_write(device, handle, 0, &bbe, sizeof(bbe));

		igt_fork_hang_detector(device);
	}

	igt_subtest("basic-series")
		series(device, handle, 5);

	igt_subtest("basic-parallel")
		parallel(device, handle, 5);

	igt_subtest("basic-sequential")
		sequential(device, handle, 0, 5);

	for (e = intel_execution_engines; e->name; e++) {
		igt_subtest_f("%s", e->name)
			single(device, handle, e->exec_id | e->flags, e->name);
		igt_subtest_f("signal-%s", e->name)
			fence_signal(device, handle, e->exec_id | e->flags, e->name, 5);
	}

	igt_subtest("signal-all")
		fence_signal(device, handle, ALL_ENGINES, "all", 150);

	igt_subtest("series")
		series(device, handle, 150);

	igt_subtest("parallel")
		parallel(device, handle, 150);

	igt_subtest("sequential")
		sequential(device, handle, 0, 150);

	igt_subtest("forked-sequential")
		sequential(device, handle, FORKED, 150);

	igt_subtest("chained-sequential")
		sequential(device, handle, FORKED | CHAINED, 150);

	igt_subtest("context-sequential")
		sequential(device, handle, FORKED | CONTEXT, 150);

	igt_subtest_group {
		igt_fixture {
			gem_require_contexts(device);
			igt_require(gem_scheduler_has_ctx_priority(device));
			igt_require(gem_scheduler_has_preemption(device));
		}

		for (e = intel_execution_engines; e->name; e++) {
			igt_subtest_f("preempt-%s", e->name)
				preempt(device, handle, e->exec_id | e->flags, e->name);
		}
	}

	igt_subtest_group {
		igt_fixture {
			igt_device_set_master(device);
		}

		for (e = intel_execution_engines; e->name; e++) {
			/* Requires master for STORE_DWORD on gen4/5 */
			igt_subtest_f("poll-%s", e->name)
				poll_ring(device,
					  e->exec_id | e->flags, e->name, 20);
		}

		igt_subtest("poll-sequential")
			poll_sequential(device, "Sequential", 20);

		igt_subtest("headless") {
			/* Requires master for changing display modes */
			headless(device, handle);
		}
	}

	igt_fixture {
		igt_stop_hang_detector();
		gem_close(device, handle);
		close(device);
	}
}