1 /*
2  * Copyright © 2017 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include <unistd.h>
26 #include <stdlib.h>
27 #include <stdint.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <fcntl.h>
31 #include <inttypes.h>
32 #include <errno.h>
33 #include <poll.h>
34 #include <sys/stat.h>
35 #include <sys/types.h>
36 #include <sys/ioctl.h>
37 #include <sys/time.h>
38 #include <sys/wait.h>
39 #include <time.h>
40 #include <assert.h>
41 #include <limits.h>
42 #include <pthread.h>
43 
44 #include "intel_chipset.h"
45 #include "intel_reg.h"
46 #include "drm.h"
47 #include "ioctl_wrappers.h"
48 #include "drmtest.h"
49 
50 #include "intel_io.h"
51 #include "igt_aux.h"
52 #include "igt_rand.h"
53 #include "igt_perf.h"
54 #include "sw_sync.h"
55 #include "i915/gem_mman.h"
56 
57 #include "ewma.h"
58 
59 enum intel_engine_id {
60 	DEFAULT,
61 	RCS,
62 	BCS,
63 	VCS,
64 	VCS1,
65 	VCS2,
66 	VECS,
67 	NUM_ENGINES
68 };
69 
70 struct duration {
71 	unsigned int min, max;
72 };
73 
74 enum w_type
75 {
76 	BATCH,
77 	SYNC,
78 	DELAY,
79 	PERIOD,
80 	THROTTLE,
81 	QD_THROTTLE,
82 	SW_FENCE,
83 	SW_FENCE_SIGNAL,
84 	CTX_PRIORITY,
85 	PREEMPTION,
86 	ENGINE_MAP,
87 	LOAD_BALANCE,
88 	BOND,
89 	TERMINATE,
90 	SSEU
91 };
92 
93 struct deps
94 {
95 	int nr;
96 	bool submit_fence;
97 	int *list;
98 };
99 
100 struct w_arg {
101 	char *filename;
102 	char *desc;
103 	int prio;
104 	bool sseu;
105 };
106 
107 struct bond {
108 	uint64_t mask;
109 	enum intel_engine_id master;
110 };
111 
112 struct w_step
113 {
114 	/* Workload step metadata */
115 	enum w_type type;
116 	unsigned int context;
117 	unsigned int engine;
118 	struct duration duration;
119 	bool unbound_duration;
120 	struct deps data_deps;
121 	struct deps fence_deps;
122 	int emit_fence;
123 	union {
124 		int sync;
125 		int delay;
126 		int period;
127 		int target;
128 		int throttle;
129 		int fence_signal;
130 		int priority;
131 		struct {
132 			unsigned int engine_map_count;
133 			enum intel_engine_id *engine_map;
134 		};
135 		bool load_balance;
136 		struct {
137 			uint64_t bond_mask;
138 			enum intel_engine_id bond_master;
139 		};
140 		int sseu;
141 	};
142 
143 	/* Implementation details */
144 	unsigned int idx;
145 	struct igt_list rq_link;
146 	unsigned int request;
147 	unsigned int preempt_us;
148 
149 	struct drm_i915_gem_execbuffer2 eb;
150 	struct drm_i915_gem_exec_object2 *obj;
151 	struct drm_i915_gem_relocation_entry reloc[5];
152 	unsigned long bb_sz;
153 	uint32_t bb_handle;
154 	uint32_t *seqno_value;
155 	uint32_t *seqno_address;
156 	uint32_t *rt0_value;
157 	uint32_t *rt0_address;
158 	uint32_t *rt1_address;
159 	uint32_t *latch_value;
160 	uint32_t *latch_address;
161 	uint32_t *recursive_bb_start;
162 };
163 
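/*
 * Exponentially weighted moving average over uint64_t samples, used to
 * smooth per-engine queue-depth/runtime estimates for the balancers.
 */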
164 DECLARE_EWMA(uint64_t, rt, 4, 2)
165 
166 struct ctx {
167 	uint32_t id;
168 	int priority;
169 	unsigned int engine_map_count;
170 	enum intel_engine_id *engine_map;
171 	unsigned int bond_count;
172 	struct bond *bonds;
173 	bool targets_instance;
174 	bool wants_balance;
175 	unsigned int static_vcs;
176 	uint64_t sseu;
177 };
178 
179 struct workload
180 {
181 	unsigned int id;
182 
183 	unsigned int nr_steps;
184 	struct w_step *steps;
185 	int prio;
186 	bool sseu;
187 
188 	pthread_t thread;
189 	bool run;
190 	bool background;
191 	const struct workload_balancer *balancer;
192 	unsigned int repeat;
193 	unsigned int flags;
194 	bool print_stats;
195 
196 	uint32_t bb_prng;
197 	uint32_t prng;
198 
199 	struct timespec repeat_start;
200 
201 	unsigned int nr_ctxs;
202 	struct ctx *ctx_list;
203 
204 	int sync_timeline;
205 	uint32_t sync_seqno;
206 
207 	uint32_t seqno[NUM_ENGINES];
208 	struct drm_i915_gem_exec_object2 status_object[2];
209 	uint32_t *status_page;
210 	uint32_t *status_cs;
211 	unsigned int vcs_rr;
212 
213 	unsigned long qd_sum[NUM_ENGINES];
214 	unsigned long nr_bb[NUM_ENGINES];
215 
216 	struct igt_list requests[NUM_ENGINES];
217 	unsigned int nrequest[NUM_ENGINES];
218 
219 	struct workload *global_wrk;
220 	const struct workload_balancer *global_balancer;
221 	pthread_mutex_t mutex;
222 
223 	union {
224 		struct rtavg {
225 			struct ewma_rt avg[NUM_ENGINES];
226 			uint32_t last[NUM_ENGINES];
227 		} rt;
228 	};
229 
230 	struct busy_balancer {
231 		int fd;
232 		bool first;
233 		unsigned int num_engines;
234 		unsigned int engine_map[NUM_ENGINES];
235 		uint64_t t_prev;
236 		uint64_t prev[NUM_ENGINES];
237 		double busy[NUM_ENGINES];
238 	} busy_balancer;
239 };
240 
241 static const unsigned int nop_calibration_us = 1000;
242 static unsigned long nop_calibration;
243 
244 static unsigned int master_prng;
245 
246 static unsigned int context_vcs_rr;
247 
248 static int verbose = 1;
249 static int fd;
250 static struct drm_i915_gem_context_param_sseu device_sseu = {
251 	.slice_mask = -1 /* Force read on first use. */
252 };
253 
254 #define SWAPVCS		(1<<0)
255 #define SEQNO		(1<<1)
256 #define BALANCE		(1<<2)
257 #define RT		(1<<3)
258 #define VCS2REMAP	(1<<4)
259 #define INITVCSRR	(1<<5)
260 #define SYNCEDCLIENTS	(1<<6)
261 #define HEARTBEAT	(1<<7)
262 #define GLOBAL_BALANCE	(1<<8)
263 #define DEPSYNC		(1<<9)
264 #define I915		(1<<10)
265 #define SSEU		(1<<11)
266 
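/*
 * Each engine owns a 16-dword slot in the shared status page; SEQNO_IDX is
 * the dword index of that slot and SEQNO_OFFSET its byte offset.
 */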
267 #define SEQNO_IDX(engine) ((engine) * 16)
268 #define SEQNO_OFFSET(engine) (SEQNO_IDX(engine) * sizeof(uint32_t))
269 
270 #define RCS_TIMESTAMP (0x2000 + 0x358)
271 #define REG(x) (volatile uint32_t *)((volatile char *)igt_global_mmio + x)
272 
273 static const char *ring_str_map[NUM_ENGINES] = {
274 	[DEFAULT] = "DEFAULT",
275 	[RCS] = "RCS",
276 	[BCS] = "BCS",
277 	[VCS] = "VCS",
278 	[VCS1] = "VCS1",
279 	[VCS2] = "VCS2",
280 	[VECS] = "VECS",
281 };
282 
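/*
 * Parse a '/'-separated dependency list. Bare (negative, relative) step
 * offsets are data dependencies; an 'f' prefix marks a fence dependency and
 * an 's' prefix a submit fence. Offsets must point backwards in the workload.
 */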
283 static int
284 parse_dependencies(unsigned int nr_steps, struct w_step *w, char *_desc)
285 {
286 	char *desc = strdup(_desc);
287 	char *token, *tctx = NULL, *tstart = desc;
288 
289 	igt_assert(desc);
290 	igt_assert(!w->data_deps.nr && w->data_deps.nr == w->fence_deps.nr);
291 	igt_assert(!w->data_deps.list &&
292 		   w->data_deps.list == w->fence_deps.list);
293 
294 	while ((token = strtok_r(tstart, "/", &tctx)) != NULL) {
295 		bool submit_fence = false;
296 		char *str = token;
297 		struct deps *deps;
298 		int dep;
299 
300 		tstart = NULL;
301 
302 		if (str[0] == '-' || (str[0] >= '0' && str[0] <= '9')) {
303 			deps = &w->data_deps;
304 		} else {
305 			if (str[0] == 's')
306 				submit_fence = true;
307 			else if (str[0] != 'f')
308 				return -1;
309 
310 			deps = &w->fence_deps;
311 			str++;
312 		}
313 
314 		dep = atoi(str);
315 		if (dep > 0 || ((int)nr_steps + dep) < 0) {
316 			if (deps->list)
317 				free(deps->list);
318 			return -1;
319 		}
320 
321 		if (dep < 0) {
322 			deps->nr++;
323 			/* Multiple fences not yet supported. */
324 			igt_assert(deps->nr == 1 || deps != &w->fence_deps);
325 			deps->list = realloc(deps->list,
326 					     sizeof(*deps->list) * deps->nr);
327 			igt_assert(deps->list);
328 			deps->list[deps->nr - 1] = dep;
329 			deps->submit_fence = submit_fence;
330 		}
331 	}
332 
333 	free(desc);
334 
335 	return 0;
336 }
337 
338 static void __attribute__((format(printf, 1, 2)))
339 wsim_err(const char *fmt, ...)
340 {
341 	va_list ap;
342 
343 	if (!verbose)
344 		return;
345 
346 	va_start(ap, fmt);
347 	vfprintf(stderr, fmt, ap);
348 	va_end(ap);
349 }
350 
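/*
 * Report a parse error and return NULL from the calling (pointer-returning)
 * parser function when the condition is true.
 */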
351 #define check_arg(cond, fmt, ...) \
352 { \
353 	if (cond) { \
354 		wsim_err(fmt, __VA_ARGS__); \
355 		return NULL; \
356 	} \
357 }
358 
359 static int str_to_engine(const char *str)
360 {
361 	unsigned int i;
362 
363 	for (i = 0; i < ARRAY_SIZE(ring_str_map); i++) {
364 		if (!strcasecmp(str, ring_str_map[i]))
365 			return i;
366 	}
367 
368 	return -1;
369 }
370 
371 static bool __engines_queried;
372 static unsigned int __num_engines;
373 static struct i915_engine_class_instance *__engines;
374 
375 static int
376 __i915_query(int i915, struct drm_i915_query *q)
377 {
378 	if (igt_ioctl(i915, DRM_IOCTL_I915_QUERY, q))
379 		return -errno;
380 	return 0;
381 }
382 
383 static int
384 __i915_query_items(int i915, struct drm_i915_query_item *items, uint32_t n_items)
385 {
386 	struct drm_i915_query q = {
387 		.num_items = n_items,
388 		.items_ptr = to_user_pointer(items),
389 	};
390 	return __i915_query(i915, &q);
391 }
392 
393 static void
394 i915_query_items(int i915, struct drm_i915_query_item *items, uint32_t n_items)
395 {
396 	igt_assert_eq(__i915_query_items(i915, items, n_items), 0);
397 }
398 
399 static bool has_engine_query(int i915)
400 {
401 	struct drm_i915_query_item item = {
402 		.query_id = DRM_I915_QUERY_ENGINE_INFO,
403 	};
404 
405 	return __i915_query_items(i915, &item, 1) == 0 && item.length > 0;
406 }
407 
408 static void query_engines(void)
409 {
410 	struct i915_engine_class_instance *engines;
411 	unsigned int num;
412 
413 	if (__engines_queried)
414 		return;
415 
416 	__engines_queried = true;
417 
418 	if (!has_engine_query(fd)) {
419 		unsigned int num_bsd = gem_has_bsd(fd) + gem_has_bsd2(fd);
420 		unsigned int i = 0;
421 
422 		igt_assert(num_bsd);
423 
424 		num = 1 + num_bsd;
425 
426 		if (gem_has_blt(fd))
427 			num++;
428 
429 		if (gem_has_vebox(fd))
430 			num++;
431 
432 		engines = calloc(num,
433 				 sizeof(struct i915_engine_class_instance));
434 		igt_assert(engines);
435 
436 		engines[i].engine_class = I915_ENGINE_CLASS_RENDER;
437 		engines[i].engine_instance = 0;
438 		i++;
439 
440 		if (gem_has_blt(fd)) {
441 			engines[i].engine_class = I915_ENGINE_CLASS_COPY;
442 			engines[i].engine_instance = 0;
443 			i++;
444 		}
445 
446 		if (gem_has_bsd(fd)) {
447 			engines[i].engine_class = I915_ENGINE_CLASS_VIDEO;
448 			engines[i].engine_instance = 0;
449 			i++;
450 		}
451 
452 		if (gem_has_bsd2(fd)) {
453 			engines[i].engine_class = I915_ENGINE_CLASS_VIDEO;
454 			engines[i].engine_instance = 1;
455 			i++;
456 		}
457 
458 		if (gem_has_vebox(fd)) {
459 			engines[i].engine_class =
460 				I915_ENGINE_CLASS_VIDEO_ENHANCE;
461 			engines[i].engine_instance = 0;
462 			i++;
463 		}
464 	} else {
465 		struct drm_i915_query_engine_info *engine_info;
466 		struct drm_i915_query_item item = {
467 			.query_id = DRM_I915_QUERY_ENGINE_INFO,
468 		};
469 		const unsigned int sz = 4096;
470 		unsigned int i;
471 
472 		engine_info = malloc(sz);
473 		igt_assert(engine_info);
474 		memset(engine_info, 0, sz);
475 
476 		item.data_ptr = to_user_pointer(engine_info);
477 		item.length = sz;
478 
479 		i915_query_items(fd, &item, 1);
480 		igt_assert(item.length > 0);
481 		igt_assert(item.length <= sz);
482 
483 		num = engine_info->num_engines;
484 
485 		engines = calloc(num,
486 				 sizeof(struct i915_engine_class_instance));
487 		igt_assert(engines);
488 
489 		for (i = 0; i < num; i++) {
490 			struct drm_i915_engine_info *engine =
491 				(struct drm_i915_engine_info *)&engine_info->engines[i];
492 
493 			engines[i] = engine->engine;
494 		}
495 	}
496 
497 	__engines = engines;
498 	__num_engines = num;
499 }
500 
501 static unsigned int num_engines_in_class(enum intel_engine_id class)
502 {
503 	unsigned int i, count = 0;
504 
505 	igt_assert(class == VCS);
506 
507 	query_engines();
508 
509 	for (i = 0; i < __num_engines; i++) {
510 		if (__engines[i].engine_class == I915_ENGINE_CLASS_VIDEO)
511 			count++;
512 	}
513 
514 	igt_assert(count);
515 	return count;
516 }
517 
518 static void
519 fill_engines_class(struct i915_engine_class_instance *ci,
520 		   enum intel_engine_id class)
521 {
522 	unsigned int i, j = 0;
523 
524 	igt_assert(class == VCS);
525 
526 	query_engines();
527 
528 	for (i = 0; i < __num_engines; i++) {
529 		if (__engines[i].engine_class != I915_ENGINE_CLASS_VIDEO)
530 			continue;
531 
532 		ci[j].engine_class = __engines[i].engine_class;
533 		ci[j].engine_instance = __engines[i].engine_instance;
534 		j++;
535 	}
536 }
537 
538 static void
539 fill_engines_id_class(enum intel_engine_id *list,
540 		      enum intel_engine_id class)
541 {
542 	enum intel_engine_id engine = VCS1;
543 	unsigned int i, j = 0;
544 
545 	igt_assert(class == VCS);
546 	igt_assert(num_engines_in_class(VCS) <= 2);
547 
548 	query_engines();
549 
550 	for (i = 0; i < __num_engines; i++) {
551 		if (__engines[i].engine_class != I915_ENGINE_CLASS_VIDEO)
552 			continue;
553 
554 		list[j++] = engine++;
555 	}
556 }
557 
558 static unsigned int
559 find_physical_instance(enum intel_engine_id class, unsigned int logical)
560 {
561 	unsigned int i, j = 0;
562 
563 	igt_assert(class == VCS);
564 
565 	for (i = 0; i < __num_engines; i++) {
566 		if (__engines[i].engine_class != I915_ENGINE_CLASS_VIDEO)
567 			continue;
568 
569 		/* Map logical to physical instances. */
570 		if (logical == j++)
571 			return __engines[i].engine_instance;
572 	}
573 
574 	igt_assert(0);
575 	return 0;
576 }
577 
578 static struct i915_engine_class_instance
579 get_engine(enum intel_engine_id engine)
580 {
581 	struct i915_engine_class_instance ci;
582 
583 	query_engines();
584 
585 	switch (engine) {
586 	case RCS:
587 		ci.engine_class = I915_ENGINE_CLASS_RENDER;
588 		ci.engine_instance = 0;
589 		break;
590 	case BCS:
591 		ci.engine_class = I915_ENGINE_CLASS_COPY;
592 		ci.engine_instance = 0;
593 		break;
594 	case VCS1:
595 	case VCS2:
596 		ci.engine_class = I915_ENGINE_CLASS_VIDEO;
597 		ci.engine_instance = find_physical_instance(VCS, engine - VCS1);
598 		break;
599 	case VECS:
600 		ci.engine_class = I915_ENGINE_CLASS_VIDEO_ENHANCE;
601 		ci.engine_instance = 0;
602 		break;
603 	default:
604 		igt_assert(0);
605 	};
606 
607 	return ci;
608 }
609 
610 static int parse_engine_map(struct w_step *step, const char *_str)
611 {
612 	char *token, *tctx = NULL, *tstart = (char *)_str;
613 
614 	while ((token = strtok_r(tstart, "|", &tctx))) {
615 		enum intel_engine_id engine;
616 		unsigned int add;
617 
618 		tstart = NULL;
619 
620 		if (!strcmp(token, "DEFAULT"))
621 			return -1;
622 
623 		engine = str_to_engine(token);
624 		if ((int)engine < 0)
625 			return -1;
626 
627 		if (engine != VCS && engine != VCS1 && engine != VCS2 &&
628 		    engine != RCS)
629 			return -1; /* TODO */
630 
631 		add = engine == VCS ? num_engines_in_class(VCS) : 1;
632 		step->engine_map_count += add;
633 		step->engine_map = realloc(step->engine_map,
634 					   step->engine_map_count *
635 					   sizeof(step->engine_map[0]));
636 
637 		if (engine != VCS)
638 			step->engine_map[step->engine_map_count - add] = engine;
639 		else
640 			fill_engines_id_class(&step->engine_map[step->engine_map_count - add], VCS);
641 	}
642 
643 	return 0;
644 }
645 
646 static uint64_t engine_list_mask(const char *_str)
647 {
648 	uint64_t mask = 0;
649 
650 	char *token, *tctx = NULL, *tstart = (char *)_str;
651 
652 	while ((token = strtok_r(tstart, "|", &tctx))) {
653 		enum intel_engine_id engine = str_to_engine(token);
654 
655 		if ((int)engine < 0 || engine == DEFAULT || engine == VCS)
656 			return 0;
657 
658 		mask |= 1 << engine;
659 
660 		tstart = NULL;
661 	}
662 
663 	return mask;
664 }
665 
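/*
 * Parse the next '.'-separated field as an integer argument for a simple
 * one-value step, validate it and jump straight to add_step.
 */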
666 #define int_field(_STEP_, _FIELD_, _COND_, _ERR_) \
667 	if ((field = strtok_r(fstart, ".", &fctx))) { \
668 		tmp = atoi(field); \
669 		check_arg(_COND_, _ERR_, nr_steps); \
670 		step.type = _STEP_; \
671 		step._FIELD_ = tmp; \
672 		goto add_step; \
673 	} \
674 
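/*
 * Turn one comma-separated workload description into an array of w_steps.
 * Each step is a '.'-separated record: either a single-letter control step
 * or a five-field batch step (context, engine, duration, dependencies, wait).
 */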
675 static struct workload *
676 parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)
677 {
678 	struct workload *wrk;
679 	unsigned int nr_steps = 0;
680 	char *desc = strdup(arg->desc);
681 	char *_token, *token, *tctx = NULL, *tstart = desc;
682 	char *field, *fctx = NULL, *fstart;
683 	struct w_step step, *steps = NULL;
684 	bool bcs_used = false;
685 	unsigned int valid;
686 	int i, j, tmp;
687 
688 	igt_assert(desc);
689 
690 	while ((_token = strtok_r(tstart, ",", &tctx))) {
691 		tstart = NULL;
692 		token = strdup(_token);
693 		igt_assert(token);
694 		fstart = token;
695 		valid = 0;
696 		memset(&step, 0, sizeof(step));
697 
698 		if ((field = strtok_r(fstart, ".", &fctx))) {
699 			fstart = NULL;
700 
701 			if (!strcmp(field, "d")) {
702 				int_field(DELAY, delay, tmp <= 0,
703 					  "Invalid delay at step %u!\n");
704 			} else if (!strcmp(field, "p")) {
705 				int_field(PERIOD, period, tmp <= 0,
706 					  "Invalid period at step %u!\n");
707 			} else if (!strcmp(field, "P")) {
708 				unsigned int nr = 0;
709 				while ((field = strtok_r(fstart, ".", &fctx))) {
710 					tmp = atoi(field);
711 					check_arg(nr == 0 && tmp <= 0,
712 						  "Invalid context at step %u!\n",
713 						  nr_steps);
714 					check_arg(nr > 1,
715 						  "Invalid priority format at step %u!\n",
716 						  nr_steps);
717 
718 					if (nr == 0)
719 						step.context = tmp;
720 					else
721 						step.priority = tmp;
722 
723 					nr++;
724 				}
725 
726 				step.type = CTX_PRIORITY;
727 				goto add_step;
728 			} else if (!strcmp(field, "s")) {
729 				int_field(SYNC, target,
730 					  tmp >= 0 || ((int)nr_steps + tmp) < 0,
731 					  "Invalid sync target at step %u!\n");
732 			} else if (!strcmp(field, "S")) {
733 				unsigned int nr = 0;
734 				while ((field = strtok_r(fstart, ".", &fctx))) {
735 					tmp = atoi(field);
736 					check_arg(tmp <= 0 && nr == 0,
737 						  "Invalid context at step %u!\n",
738 						  nr_steps);
739 					check_arg(nr > 1,
740 						  "Invalid SSEU format at step %u!\n",
741 						  nr_steps);
742 
743 					if (nr == 0)
744 						step.context = tmp;
745 					else if (nr == 1)
746 						step.sseu = tmp;
747 
748 					nr++;
749 				}
750 
751 				step.type = SSEU;
752 				goto add_step;
753 			} else if (!strcmp(field, "t")) {
754 				int_field(THROTTLE, throttle,
755 					  tmp < 0,
756 					  "Invalid throttle at step %u!\n");
757 			} else if (!strcmp(field, "q")) {
758 				int_field(QD_THROTTLE, throttle,
759 					  tmp < 0,
760 					  "Invalid qd throttle at step %u!\n");
761 			} else if (!strcmp(field, "a")) {
762 				int_field(SW_FENCE_SIGNAL, target,
763 					  tmp >= 0,
764 					  "Invalid sw fence signal at step %u!\n");
765 			} else if (!strcmp(field, "f")) {
766 				step.type = SW_FENCE;
767 				goto add_step;
768 			} else if (!strcmp(field, "M")) {
769 				unsigned int nr = 0;
770 				while ((field = strtok_r(fstart, ".", &fctx))) {
771 					tmp = atoi(field);
772 					check_arg(nr == 0 && tmp <= 0,
773 						  "Invalid context at step %u!\n",
774 						  nr_steps);
775 					check_arg(nr > 1,
776 						  "Invalid engine map format at step %u!\n",
777 						  nr_steps);
778 
779 					if (nr == 0) {
780 						step.context = tmp;
781 					} else {
782 						tmp = parse_engine_map(&step,
783 								       field);
784 						check_arg(tmp < 0,
785 							  "Invalid engine map list at step %u!\n",
786 							  nr_steps);
787 					}
788 
789 					nr++;
790 				}
791 
792 				step.type = ENGINE_MAP;
793 				goto add_step;
794 			} else if (!strcmp(field, "T")) {
795 				int_field(TERMINATE, target,
796 					  tmp >= 0 || ((int)nr_steps + tmp) < 0,
797 					  "Invalid terminate target at step %u!\n");
798 			} else if (!strcmp(field, "X")) {
799 				unsigned int nr = 0;
800 				while ((field = strtok_r(fstart, ".", &fctx))) {
801 					tmp = atoi(field);
802 					check_arg(nr == 0 && tmp <= 0,
803 						  "Invalid context at step %u!\n",
804 						  nr_steps);
805 					check_arg(nr == 1 && tmp < 0,
806 						  "Invalid preemption period at step %u!\n",
807 						  nr_steps);
808 					check_arg(nr > 1,
809 						  "Invalid preemption format at step %u!\n",
810 						  nr_steps);
811 
812 					if (nr == 0)
813 						step.context = tmp;
814 					else
815 						step.period = tmp;
816 
817 					nr++;
818 				}
819 
820 				step.type = PREEMPTION;
821 				goto add_step;
822 			} else if (!strcmp(field, "B")) {
823 				unsigned int nr = 0;
824 				while ((field = strtok_r(fstart, ".", &fctx))) {
825 					tmp = atoi(field);
826 					check_arg(nr == 0 && tmp <= 0,
827 						  "Invalid context at step %u!\n",
828 						  nr_steps);
829 					check_arg(nr > 0,
830 						  "Invalid load balance format at step %u!\n",
831 						  nr_steps);
832 
833 					step.context = tmp;
834 					step.load_balance = true;
835 
836 					nr++;
837 				}
838 
839 				step.type = LOAD_BALANCE;
840 				goto add_step;
841 			} else if (!strcmp(field, "b")) {
842 				unsigned int nr = 0;
843 				while ((field = strtok_r(fstart, ".", &fctx))) {
844 					check_arg(nr > 2,
845 						  "Invalid bond format at step %u!\n",
846 						  nr_steps);
847 
848 					if (nr == 0) {
849 						tmp = atoi(field);
850 						step.context = tmp;
851 						check_arg(tmp <= 0,
852 							  "Invalid context at step %u!\n",
853 							  nr_steps);
854 					} else if (nr == 1) {
855 						step.bond_mask = engine_list_mask(field);
856 						check_arg(step.bond_mask == 0,
857 							"Invalid siblings list at step %u!\n",
858 							nr_steps);
859 					} else if (nr == 2) {
860 						tmp = str_to_engine(field);
861 						check_arg(tmp <= 0 ||
862 							  tmp == VCS ||
863 							  tmp == DEFAULT,
864 							  "Invalid master engine at step %u!\n",
865 							  nr_steps);
866 						step.bond_master = tmp;
867 					}
868 
869 					nr++;
870 				}
871 
872 				step.type = BOND;
873 				goto add_step;
874 			}
875 
876 			if (!field) {
877 				if (verbose)
878 					fprintf(stderr,
879 						"Parse error at step %u!\n",
880 						nr_steps);
881 				return NULL;
882 			}
883 
884 			tmp = atoi(field);
885 			check_arg(tmp < 0, "Invalid ctx id at step %u!\n",
886 				  nr_steps);
887 			step.context = tmp;
888 
889 			valid++;
890 		}
891 
892 		if ((field = strtok_r(fstart, ".", &fctx))) {
893 			fstart = NULL;
894 
895 			i = str_to_engine(field);
896 			check_arg(i < 0,
897 				  "Invalid engine id at step %u!\n", nr_steps);
898 
899 			valid++;
900 
901 			step.engine = i;
902 
903 			if (step.engine == BCS)
904 				bcs_used = true;
905 		}
906 
907 		if ((field = strtok_r(fstart, ".", &fctx))) {
908 			char *sep = NULL;
909 			long int tmpl;
910 
911 			fstart = NULL;
912 
913 			if (field[0] == '*') {
914 				check_arg(intel_gen(intel_get_drm_devid(fd)) < 8,
915 					  "Infinite batch at step %u needs Gen8+!\n",
916 					  nr_steps);
917 				step.unbound_duration = true;
918 			} else {
919 				tmpl = strtol(field, &sep, 10);
920 				check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
921 					  tmpl == LONG_MAX,
922 					  "Invalid duration at step %u!\n",
923 					  nr_steps);
924 				step.duration.min = tmpl;
925 
926 				if (sep && *sep == '-') {
927 					tmpl = strtol(sep + 1, NULL, 10);
928 					check_arg(tmpl <= 0 ||
929 						tmpl <= step.duration.min ||
930 						tmpl == LONG_MIN ||
931 						tmpl == LONG_MAX,
932 						"Invalid duration range at step %u!\n",
933 						nr_steps);
934 					step.duration.max = tmpl;
935 				} else {
936 					step.duration.max = step.duration.min;
937 				}
938 			}
939 
940 			valid++;
941 		}
942 
943 		if ((field = strtok_r(fstart, ".", &fctx))) {
944 			fstart = NULL;
945 
946 			tmp = parse_dependencies(nr_steps, &step, field);
947 			check_arg(tmp < 0,
948 				  "Invalid dependency at step %u!\n", nr_steps);
949 
950 			valid++;
951 		}
952 
953 		if ((field = strtok_r(fstart, ".", &fctx))) {
954 			fstart = NULL;
955 
956 			check_arg(strlen(field) != 1 ||
957 				  (field[0] != '0' && field[0] != '1'),
958 				  "Invalid wait boolean at step %u!\n",
959 				  nr_steps);
960 			step.sync = field[0] - '0';
961 
962 			valid++;
963 		}
964 
965 		check_arg(valid != 5, "Invalid record at step %u!\n", nr_steps);
966 
967 		step.type = BATCH;
968 
969 add_step:
970 		step.idx = nr_steps++;
971 		step.request = -1;
972 		steps = realloc(steps, sizeof(step) * nr_steps);
973 		igt_assert(steps);
974 
975 		memcpy(&steps[nr_steps - 1], &step, sizeof(step));
976 
977 		free(token);
978 	}
979 
980 	if (app_w) {
981 		steps = realloc(steps, sizeof(step) *
982 				(nr_steps + app_w->nr_steps));
983 		igt_assert(steps);
984 
985 		memcpy(&steps[nr_steps], app_w->steps,
986 		       sizeof(step) * app_w->nr_steps);
987 
988 		for (i = 0; i < app_w->nr_steps; i++)
989 			steps[nr_steps + i].idx += nr_steps;
990 
991 		nr_steps += app_w->nr_steps;
992 	}
993 
994 	wrk = malloc(sizeof(*wrk));
995 	igt_assert(wrk);
996 
997 	wrk->nr_steps = nr_steps;
998 	wrk->steps = steps;
999 	wrk->prio = arg->prio;
1000 	wrk->sseu = arg->sseu;
1001 
1002 	free(desc);
1003 
1004 	/*
1005 	 * Tag all steps which need to emit a sync fence if another step is
1006 	 * referencing them as a sync fence dependency.
1007 	 */
1008 	for (i = 0; i < nr_steps; i++) {
1009 		for (j = 0; j < steps[i].fence_deps.nr; j++) {
1010 			tmp = steps[i].idx + steps[i].fence_deps.list[j];
1011 			check_arg(tmp < 0 || tmp >= i ||
1012 				  (steps[tmp].type != BATCH &&
1013 				   steps[tmp].type != SW_FENCE),
1014 				  "Invalid dependency target %u!\n", i);
1015 			steps[tmp].emit_fence = -1;
1016 		}
1017 	}
1018 
1019 	/* Validate SW_FENCE_SIGNAL targets. */
1020 	for (i = 0; i < nr_steps; i++) {
1021 		if (steps[i].type == SW_FENCE_SIGNAL) {
1022 			tmp = steps[i].idx + steps[i].target;
1023 			check_arg(tmp < 0 || tmp >= i ||
1024 				  steps[tmp].type != SW_FENCE,
1025 				  "Invalid sw fence target %u!\n", i);
1026 		}
1027 	}
1028 
1029 	if (bcs_used && (flags & VCS2REMAP) && verbose)
1030 		printf("BCS usage in workload with VCS2 remapping enabled!\n");
1031 
1032 	return wrk;
1033 }
1034 
1035 static struct workload *
1036 clone_workload(struct workload *_wrk)
1037 {
1038 	struct workload *wrk;
1039 	int i;
1040 
1041 	wrk = malloc(sizeof(*wrk));
1042 	igt_assert(wrk);
1043 	memset(wrk, 0, sizeof(*wrk));
1044 
1045 	wrk->prio = _wrk->prio;
1046 	wrk->sseu = _wrk->sseu;
1047 	wrk->nr_steps = _wrk->nr_steps;
1048 	wrk->steps = calloc(wrk->nr_steps, sizeof(struct w_step));
1049 	igt_assert(wrk->steps);
1050 
1051 	memcpy(wrk->steps, _wrk->steps, sizeof(struct w_step) * wrk->nr_steps);
1052 
1053 	/* Check if we need a sw sync timeline. */
1054 	for (i = 0; i < wrk->nr_steps; i++) {
1055 		if (wrk->steps[i].type == SW_FENCE) {
1056 			wrk->sync_timeline = sw_sync_timeline_create();
1057 			igt_assert(wrk->sync_timeline >= 0);
1058 			break;
1059 		}
1060 	}
1061 
1062 	for (i = 0; i < NUM_ENGINES; i++)
1063 		igt_list_init(&wrk->requests[i]);
1064 
1065 	return wrk;
1066 }
1067 
1068 #define rounddown(x, y) (x - (x%y))
1069 #ifndef PAGE_SIZE
1070 #define PAGE_SIZE (4096)
1071 #endif
1072 
1073 static unsigned int get_duration(struct workload *wrk, struct w_step *w)
1074 {
1075 	struct duration *dur = &w->duration;
1076 
1077 	if (dur->min == dur->max)
1078 		return dur->min;
1079 	else
1080 		return dur->min + hars_petruska_f54_1_random(&wrk->bb_prng) %
1081 		       (dur->max + 1 - dur->min);
1082 }
1083 
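/* Size a nop batch so it runs for ~duration microseconds at the calibrated nop rate. */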
1084 static unsigned long get_bb_sz(unsigned int duration)
1085 {
1086 	return ALIGN(duration * nop_calibration * sizeof(uint32_t) /
1087 		     nop_calibration_us, sizeof(uint32_t));
1088 }
1089 
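/*
 * Fill the nop batch with MI_ARB_CHK commands spaced so that a preemption
 * opportunity occurs roughly every w->preempt_us microseconds.
 */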
1090 static void
1091 init_bb(struct w_step *w, unsigned int flags)
1092 {
1093 	const unsigned int arb_period =
1094 			get_bb_sz(w->preempt_us) / sizeof(uint32_t);
1095 	const unsigned int mmap_len = ALIGN(w->bb_sz, 4096);
1096 	unsigned int i;
1097 	uint32_t *ptr;
1098 
1099 	if (w->unbound_duration || !arb_period)
1100 		return;
1101 
1102 	gem_set_domain(fd, w->bb_handle,
1103 		       I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
1104 
1105 	ptr = gem_mmap__wc(fd, w->bb_handle, 0, mmap_len, PROT_WRITE);
1106 
1107 	for (i = arb_period; i < w->bb_sz / sizeof(uint32_t); i += arb_period)
1108 		ptr[i] = 0x5 << 23; /* MI_ARB_CHK */
1109 
1110 	munmap(ptr, mmap_len);
1111 }
1112 
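/*
 * Write the tail of the batch: an optional self-referencing
 * MI_BATCH_BUFFER_START for unbound batches, the SEQNO/RT store dwords and
 * the final MI_BATCH_BUFFER_END. Returns the number of relocations used.
 */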
1113 static unsigned int
1114 terminate_bb(struct w_step *w, unsigned int flags)
1115 {
1116 	const uint32_t bbe = 0xa << 23;
1117 	unsigned long mmap_start, mmap_len;
1118 	unsigned long batch_start = w->bb_sz;
1119 	unsigned int r = 0;
1120 	uint32_t *ptr, *cs;
1121 
1122 	igt_assert(((flags & RT) && (flags & SEQNO)) || !(flags & RT));
1123 
1124 	batch_start -= sizeof(uint32_t); /* bbend */
1125 	if (flags & SEQNO)
1126 		batch_start -= 4 * sizeof(uint32_t);
1127 	if (flags & RT)
1128 		batch_start -= 12 * sizeof(uint32_t);
1129 
1130 	if (w->unbound_duration)
1131 		batch_start -= 4 * sizeof(uint32_t); /* MI_ARB_CHK + MI_BATCH_BUFFER_START */
1132 
1133 	mmap_start = rounddown(batch_start, PAGE_SIZE);
1134 	mmap_len = ALIGN(w->bb_sz - mmap_start, PAGE_SIZE);
1135 
1136 	gem_set_domain(fd, w->bb_handle,
1137 		       I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
1138 
1139 	ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
1140 	cs = (uint32_t *)((char *)ptr + batch_start - mmap_start);
1141 
1142 	if (w->unbound_duration) {
1143 		w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
1144 		batch_start += 4 * sizeof(uint32_t);
1145 
1146 		*cs++ = w->preempt_us ? 0x5 << 23 /* MI_ARB_CHK; */ : MI_NOOP;
1147 		w->recursive_bb_start = cs;
1148 		*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
1149 		*cs++ = 0;
1150 		*cs++ = 0;
1151 	}
1152 
1153 	if (flags & SEQNO) {
1154 		w->reloc[r++].offset = batch_start + sizeof(uint32_t);
1155 		batch_start += 4 * sizeof(uint32_t);
1156 
1157 		*cs++ = MI_STORE_DWORD_IMM;
1158 		w->seqno_address = cs;
1159 		*cs++ = 0;
1160 		*cs++ = 0;
1161 		w->seqno_value = cs;
1162 		*cs++ = 0;
1163 	}
1164 
1165 	if (flags & RT) {
1166 		w->reloc[r++].offset = batch_start + sizeof(uint32_t);
1167 		batch_start += 4 * sizeof(uint32_t);
1168 
1169 		*cs++ = MI_STORE_DWORD_IMM;
1170 		w->rt0_address = cs;
1171 		*cs++ = 0;
1172 		*cs++ = 0;
1173 		w->rt0_value = cs;
1174 		*cs++ = 0;
1175 
1176 		w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
1177 		batch_start += 4 * sizeof(uint32_t);
1178 
1179 		*cs++ = 0x24 << 23 | 2; /* MI_STORE_REG_MEM */
1180 		*cs++ = RCS_TIMESTAMP;
1181 		w->rt1_address = cs;
1182 		*cs++ = 0;
1183 		*cs++ = 0;
1184 
1185 		w->reloc[r++].offset = batch_start + sizeof(uint32_t);
1186 		batch_start += 4 * sizeof(uint32_t);
1187 
1188 		*cs++ = MI_STORE_DWORD_IMM;
1189 		w->latch_address = cs;
1190 		*cs++ = 0;
1191 		*cs++ = 0;
1192 		w->latch_value = cs;
1193 		*cs++ = 0;
1194 	}
1195 
1196 	*cs = bbe;
1197 
1198 	return r;
1199 }
1200 
1201 static const unsigned int eb_engine_map[NUM_ENGINES] = {
1202 	[DEFAULT] = I915_EXEC_DEFAULT,
1203 	[RCS] = I915_EXEC_RENDER,
1204 	[BCS] = I915_EXEC_BLT,
1205 	[VCS] = I915_EXEC_BSD,
1206 	[VCS1] = I915_EXEC_BSD | I915_EXEC_BSD_RING1,
1207 	[VCS2] = I915_EXEC_BSD | I915_EXEC_BSD_RING2,
1208 	[VECS] = I915_EXEC_VEBOX
1209 };
1210 
1211 static void
1212 eb_set_engine(struct drm_i915_gem_execbuffer2 *eb,
1213 	      enum intel_engine_id engine,
1214 	      unsigned int flags)
1215 {
1216 	if (engine == VCS2 && (flags & VCS2REMAP))
1217 		engine = BCS;
1218 
1219 	if ((flags & I915) && engine == VCS)
1220 		eb->flags = 0;
1221 	else
1222 		eb->flags = eb_engine_map[engine];
1223 }
1224 
1225 static unsigned int
1226 find_engine_in_map(struct ctx *ctx, enum intel_engine_id engine)
1227 {
1228 	unsigned int i;
1229 
1230 	for (i = 0; i < ctx->engine_map_count; i++) {
1231 		if (ctx->engine_map[i] == engine)
1232 			return i + 1;
1233 	}
1234 
1235 	igt_assert(ctx->wants_balance);
1236 	return 0;
1237 }
1238 
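/*
 * Two ctx_list slots are allocated per workload context: the even slot holds
 * the main context, the odd slot an optional companion used by get_ctxid()
 * when a balancing context also targets explicit engine instances.
 */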
1239 static struct ctx *
1240 __get_ctx(struct workload *wrk, struct w_step *w)
1241 {
1242 	return &wrk->ctx_list[w->context * 2];
1243 }
1244 
1245 static void
1246 eb_update_flags(struct workload *wrk, struct w_step *w,
1247 		enum intel_engine_id engine, unsigned int flags)
1248 {
1249 	struct ctx *ctx = __get_ctx(wrk, w);
1250 
1251 	if (ctx->engine_map)
1252 		w->eb.flags = find_engine_in_map(ctx, engine);
1253 	else
1254 		eb_set_engine(&w->eb, engine, flags);
1255 
1256 	w->eb.flags |= I915_EXEC_HANDLE_LUT;
1257 	w->eb.flags |= I915_EXEC_NO_RELOC;
1258 
1259 	igt_assert(w->emit_fence <= 0);
1260 	if (w->emit_fence)
1261 		w->eb.flags |= I915_EXEC_FENCE_OUT;
1262 }
1263 
1264 static struct drm_i915_gem_exec_object2 *
1265 get_status_objects(struct workload *wrk)
1266 {
1267 	if (wrk->flags & GLOBAL_BALANCE)
1268 		return wrk->global_wrk->status_object;
1269 	else
1270 		return wrk->status_object;
1271 }
1272 
1273 static uint32_t
1274 get_ctxid(struct workload *wrk, struct w_step *w)
1275 {
1276 	struct ctx *ctx = __get_ctx(wrk, w);
1277 
1278 	if (ctx->targets_instance && ctx->wants_balance && w->engine == VCS)
1279 		return wrk->ctx_list[w->context * 2 + 1].id;
1280 	else
1281 		return wrk->ctx_list[w->context * 2].id;
1282 }
1283 
1284 static void
1285 alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags)
1286 {
1287 	enum intel_engine_id engine = w->engine;
1288 	unsigned int j = 0;
1289 	unsigned int nr_obj = 3 + w->data_deps.nr;
1290 	unsigned int i;
1291 
1292 	w->obj = calloc(nr_obj, sizeof(*w->obj));
1293 	igt_assert(w->obj);
1294 
1295 	w->obj[j].handle = gem_create(fd, 4096);
1296 	w->obj[j].flags = EXEC_OBJECT_WRITE;
1297 	j++;
1298 	igt_assert(j < nr_obj);
1299 
1300 	if (flags & SEQNO) {
1301 		w->obj[j++] = get_status_objects(wrk)[0];
1302 		igt_assert(j < nr_obj);
1303 	}
1304 
1305 	for (i = 0; i < w->data_deps.nr; i++) {
1306 		igt_assert(w->data_deps.list[i] <= 0);
1307 		if (w->data_deps.list[i]) {
1308 			int dep_idx = w->idx + w->data_deps.list[i];
1309 
1310 			igt_assert(dep_idx >= 0 && dep_idx < w->idx);
1311 			igt_assert(wrk->steps[dep_idx].type == BATCH);
1312 
1313 			w->obj[j].handle = wrk->steps[dep_idx].obj[0].handle;
1314 			j++;
1315 			igt_assert(j < nr_obj);
1316 		}
1317 	}
1318 
1319 	if (w->unbound_duration)
1320 		/* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */
1321 		w->bb_sz = max(PAGE_SIZE, get_bb_sz(w->preempt_us)) +
1322 			   (1 + 3) * sizeof(uint32_t);
1323 	else
1324 		w->bb_sz = get_bb_sz(w->duration.max);
1325 	w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0));
1326 	init_bb(w, flags);
1327 	w->obj[j].relocation_count = terminate_bb(w, flags);
1328 
1329 	if (w->obj[j].relocation_count) {
1330 		w->obj[j].relocs_ptr = to_user_pointer(&w->reloc);
1331 		for (i = 0; i < w->obj[j].relocation_count; i++)
1332 			w->reloc[i].target_handle = 1;
1333 		if (w->unbound_duration)
1334 			w->reloc[0].target_handle = j;
1335 	}
1336 
1337 	w->eb.buffers_ptr = to_user_pointer(w->obj);
1338 	w->eb.buffer_count = j + 1;
1339 	w->eb.rsvd1 = get_ctxid(wrk, w);
1340 
1341 	if (flags & SWAPVCS && engine == VCS1)
1342 		engine = VCS2;
1343 	else if (flags & SWAPVCS && engine == VCS2)
1344 		engine = VCS1;
1345 	eb_update_flags(wrk, w, engine, flags);
1346 #ifdef DEBUG
1347 	printf("%u: %u:|", w->idx, w->eb.buffer_count);
1348 	for (i = 0; i <= j; i++)
1349 		printf("%x|", w->obj[i].handle);
1350 	printf(" %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n",
1351 		w->bb_sz, w->eb.flags, w->bb_handle, j, w->context,
1352 		get_ctxid(wrk, w));
1353 #endif
1354 }
1355 
1356 static void __ctx_set_prio(uint32_t ctx_id, unsigned int prio)
1357 {
1358 	struct drm_i915_gem_context_param param = {
1359 		.ctx_id = ctx_id,
1360 		.param = I915_CONTEXT_PARAM_PRIORITY,
1361 		.value = prio,
1362 	};
1363 
1364 	if (prio)
1365 		gem_context_set_param(fd, &param);
1366 }
1367 
1368 static int __vm_destroy(int i915, uint32_t vm_id)
1369 {
1370 	struct drm_i915_gem_vm_control ctl = { .vm_id = vm_id };
1371 	int err = 0;
1372 
1373 	if (igt_ioctl(i915, DRM_IOCTL_I915_GEM_VM_DESTROY, &ctl)) {
1374 		err = -errno;
1375 		igt_assume(err);
1376 	}
1377 
1378 	errno = 0;
1379 	return err;
1380 }
1381 
1382 static void vm_destroy(int i915, uint32_t vm_id)
1383 {
1384 	igt_assert_eq(__vm_destroy(i915, vm_id), 0);
1385 }
1386 
1387 static unsigned int
1388 find_engine(struct i915_engine_class_instance *ci, unsigned int count,
1389 	    enum intel_engine_id engine)
1390 {
1391 	struct i915_engine_class_instance e = get_engine(engine);
1392 	unsigned int i;
1393 
1394 	for (i = 0; i < count; i++, ci++) {
1395 		if (!memcmp(&e, ci, sizeof(*ci)))
1396 			return i;
1397 	}
1398 
1399 	igt_assert(0);
1400 	return 0;
1401 }
1402 
1403 static struct drm_i915_gem_context_param_sseu get_device_sseu(void)
1404 {
1405 	struct drm_i915_gem_context_param param = { };
1406 
1407 	if (device_sseu.slice_mask == -1) {
1408 		param.param = I915_CONTEXT_PARAM_SSEU;
1409 		param.value = (uintptr_t)&device_sseu;
1410 
1411 		gem_context_get_param(fd, &param);
1412 	}
1413 
1414 	return device_sseu;
1415 }
1416 
1417 static uint64_t
1418 set_ctx_sseu(struct ctx *ctx, uint64_t slice_mask)
1419 {
1420 	struct drm_i915_gem_context_param_sseu sseu = get_device_sseu();
1421 	struct drm_i915_gem_context_param param = { };
1422 
1423 	if (slice_mask == -1)
1424 		slice_mask = device_sseu.slice_mask;
1425 
1426 	if (ctx->engine_map && ctx->wants_balance) {
1427 		sseu.flags = I915_CONTEXT_SSEU_FLAG_ENGINE_INDEX;
1428 		sseu.engine.engine_class = I915_ENGINE_CLASS_INVALID;
1429 		sseu.engine.engine_instance = 0;
1430 	}
1431 
1432 	sseu.slice_mask = slice_mask;
1433 
1434 	param.ctx_id = ctx->id;
1435 	param.param = I915_CONTEXT_PARAM_SSEU;
1436 	param.size = sizeof(sseu);
1437 	param.value = (uintptr_t)&sseu;
1438 
1439 	gem_context_set_param(fd, &param);
1440 
1441 	return slice_mask;
1442 }
1443 
1444 static size_t sizeof_load_balance(int count)
1445 {
1446 	return offsetof(struct i915_context_engines_load_balance,
1447 			engines[count]);
1448 }
1449 
1450 static size_t sizeof_param_engines(int count)
1451 {
1452 	return offsetof(struct i915_context_param_engines,
1453 			engines[count]);
1454 }
1455 
1456 static size_t sizeof_engines_bond(int count)
1457 {
1458 	return offsetof(struct i915_context_engines_bond,
1459 			engines[count]);
1460 }
1461 
1462 #define alloca0(sz) ({ size_t sz__ = (sz); memset(alloca(sz__), 0, sz__); })
1463 
1464 static int
1465 prepare_workload(unsigned int id, struct workload *wrk, unsigned int flags)
1466 {
1467 	unsigned int ctx_vcs;
1468 	int max_ctx = -1;
1469 	struct w_step *w;
1470 	int i, j;
1471 
1472 	wrk->id = id;
1473 	wrk->prng = rand();
1474 	wrk->bb_prng = (wrk->flags & SYNCEDCLIENTS) ? master_prng : rand();
1475 	wrk->run = true;
1476 
1477 	ctx_vcs =  0;
1478 	if (flags & INITVCSRR)
1479 		ctx_vcs = id & 1;
1480 	wrk->vcs_rr = ctx_vcs;
1481 
1482 	if (flags & GLOBAL_BALANCE) {
1483 		int ret = pthread_mutex_init(&wrk->mutex, NULL);
1484 		igt_assert(ret == 0);
1485 	}
1486 
1487 	if (flags & SEQNO) {
1488 		if (!(flags & GLOBAL_BALANCE) || id == 0) {
1489 			uint32_t handle;
1490 
1491 			handle = gem_create(fd, 4096);
1492 			gem_set_caching(fd, handle, I915_CACHING_CACHED);
1493 			wrk->status_object[0].handle = handle;
1494 			wrk->status_page = gem_mmap__cpu(fd, handle, 0, 4096,
1495 							 PROT_READ);
1496 
1497 			handle = gem_create(fd, 4096);
1498 			wrk->status_object[1].handle = handle;
1499 			wrk->status_cs = gem_mmap__wc(fd, handle,
1500 						      0, 4096, PROT_WRITE);
1501 		}
1502 	}
1503 
1504 	/*
1505 	 * Pre-scan workload steps to allocate context list storage.
1506 	 */
1507 	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
1508 		int ctx = w->context * 2 + 1; /* Odd slots are special. */
1509 		int delta;
1510 
1511 		if (ctx <= max_ctx)
1512 			continue;
1513 
1514 		delta = ctx + 1 - wrk->nr_ctxs;
1515 
1516 		wrk->nr_ctxs += delta;
1517 		wrk->ctx_list = realloc(wrk->ctx_list,
1518 					wrk->nr_ctxs * sizeof(*wrk->ctx_list));
1519 		memset(&wrk->ctx_list[wrk->nr_ctxs - delta], 0,
1520 			delta * sizeof(*wrk->ctx_list));
1521 
1522 		max_ctx = ctx;
1523 	}
1524 
1525 	/*
1526 	 * Identify if contexts target specific engine instances and if they
1527 	 * want to be balanced.
1528 	 *
1529 	 * Transfer over engine map configuration from the workload step.
1530 	 */
1531 	for (j = 0; j < wrk->nr_ctxs; j += 2) {
1532 		struct ctx *ctx = &wrk->ctx_list[j];
1533 
1534 		bool targets = false;
1535 		bool balance = false;
1536 
1537 		for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
1538 			if (w->context != (j / 2))
1539 				continue;
1540 
1541 			if (w->type == BATCH) {
1542 				if (w->engine == VCS)
1543 					balance = true;
1544 				else
1545 					targets = true;
1546 			} else if (w->type == ENGINE_MAP) {
1547 				ctx->engine_map = w->engine_map;
1548 				ctx->engine_map_count = w->engine_map_count;
1549 			} else if (w->type == LOAD_BALANCE) {
1550 				if (!ctx->engine_map) {
1551 					wsim_err("Load balancing needs an engine map!\n");
1552 					return 1;
1553 				}
1554 				ctx->wants_balance = w->load_balance;
1555 			} else if (w->type == BOND) {
1556 				if (!ctx->wants_balance) {
1557 					wsim_err("Engine bonds need load balancing engine map!\n");
1558 					return 1;
1559 				}
1560 				ctx->bond_count++;
1561 				ctx->bonds = realloc(ctx->bonds,
1562 						     ctx->bond_count *
1563 						     sizeof(struct bond));
1564 				igt_assert(ctx->bonds);
1565 				ctx->bonds[ctx->bond_count - 1].mask =
1566 					w->bond_mask;
1567 				ctx->bonds[ctx->bond_count - 1].master =
1568 					w->bond_master;
1569 			}
1570 		}
1571 
1572 		wrk->ctx_list[j].targets_instance = targets;
1573 		if (flags & I915)
1574 			wrk->ctx_list[j].wants_balance |= balance;
1575 	}
1576 
1577 	/*
1578 	 * Ensure VCS is not allowed with engine map contexts.
1579 	 */
1580 	for (j = 0; j < wrk->nr_ctxs; j += 2) {
1581 		for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
1582 			if (w->context != (j / 2))
1583 				continue;
1584 
1585 			if (w->type != BATCH)
1586 				continue;
1587 
1588 			if (wrk->ctx_list[j].engine_map &&
1589 			    !wrk->ctx_list[j].wants_balance &&
1590 			    (w->engine == VCS || w->engine == DEFAULT)) {
1591 				wsim_err("Batches targetting engine maps must use explicit engines!\n");
1592 				return -1;
1593 			}
1594 		}
1595 	}
1596 
1597 
1598 	/*
1599 	 * Create and configure contexts.
1600 	 */
1601 	for (i = 0; i < wrk->nr_ctxs; i += 2) {
1602 		struct ctx *ctx = &wrk->ctx_list[i];
1603 		uint32_t ctx_id, share_vm = 0;
1604 
1605 		if (ctx->id)
1606 			continue;
1607 
1608 		if ((flags & I915) || ctx->engine_map) {
1609 			struct drm_i915_gem_context_create_ext_setparam ext = {
1610 				.base.name = I915_CONTEXT_CREATE_EXT_SETPARAM,
1611 				.param.param = I915_CONTEXT_PARAM_VM,
1612 			};
1613 			struct drm_i915_gem_context_create_ext args = { };
1614 
1615 			/* Find existing context to share ppgtt with. */
1616 			for (j = 0; j < wrk->nr_ctxs; j++) {
1617 				struct drm_i915_gem_context_param param = {
1618 					.param = I915_CONTEXT_PARAM_VM,
1619 				};
1620 
1621 				if (!wrk->ctx_list[j].id)
1622 					continue;
1623 
1624 				param.ctx_id = wrk->ctx_list[j].id;
1625 
1626 				gem_context_get_param(fd, &param);
1627 				igt_assert(param.value);
1628 
1629 				share_vm = param.value;
1630 
1631 				ext.param.value = share_vm;
1632 				args.flags =
1633 				    I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS;
1634 				args.extensions = to_user_pointer(&ext);
1635 				break;
1636 			}
1637 
1638 			if ((!ctx->engine_map && !ctx->targets_instance) ||
1639 			    (ctx->engine_map && ctx->wants_balance))
1640 				args.flags |=
1641 				     I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE;
1642 
1643 			drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT,
1644 				 &args);
1645 
1646 			ctx_id = args.ctx_id;
1647 		} else {
1648 			struct drm_i915_gem_context_create args = {};
1649 
1650 			drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &args);
1651 			ctx_id = args.ctx_id;
1652 		}
1653 
1654 		igt_assert(ctx_id);
1655 		ctx->id = ctx_id;
1656 		ctx->sseu = device_sseu.slice_mask;
1657 
1658 		if (flags & GLOBAL_BALANCE) {
1659 			ctx->static_vcs = context_vcs_rr;
1660 			context_vcs_rr ^= 1;
1661 		} else {
1662 			ctx->static_vcs = ctx_vcs;
1663 			ctx_vcs ^= 1;
1664 		}
1665 
1666 		__ctx_set_prio(ctx_id, wrk->prio);
1667 
1668 		/*
1669 		 * Do we need a separate context to satisfy workloads which
1670 		 * both want to target specific engines and be balanced by i915?
1671 		 */
1672 		if ((flags & I915) && ctx->wants_balance &&
1673 		    ctx->targets_instance && !ctx->engine_map) {
1674 			struct drm_i915_gem_context_create_ext_setparam ext = {
1675 				.base.name = I915_CONTEXT_CREATE_EXT_SETPARAM,
1676 				.param.param = I915_CONTEXT_PARAM_VM,
1677 				.param.value = share_vm,
1678 			};
1679 			struct drm_i915_gem_context_create_ext args = {
1680 				.extensions = to_user_pointer(&ext),
1681 				.flags =
1682 				    I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS |
1683 				    I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE,
1684 			};
1685 
1686 			igt_assert(share_vm);
1687 
1688 			drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT,
1689 				 &args);
1690 
1691 			igt_assert(args.ctx_id);
1692 			ctx_id = args.ctx_id;
1693 			wrk->ctx_list[i + 1].id = args.ctx_id;
1694 
1695 			__ctx_set_prio(ctx_id, wrk->prio);
1696 		}
1697 
1698 		if (ctx->engine_map) {
1699 			struct i915_context_param_engines *set_engines =
1700 				alloca0(sizeof_param_engines(ctx->engine_map_count + 1));
1701 			struct i915_context_engines_load_balance *load_balance =
1702 				alloca0(sizeof_load_balance(ctx->engine_map_count));
1703 			struct drm_i915_gem_context_param param = {
1704 				.ctx_id = ctx_id,
1705 				.param = I915_CONTEXT_PARAM_ENGINES,
1706 				.size = sizeof_param_engines(ctx->engine_map_count + 1),
1707 				.value = to_user_pointer(set_engines),
1708 			};
1709 			struct i915_context_engines_bond *last = NULL;
1710 
1711 			if (ctx->wants_balance) {
1712 				set_engines->extensions =
1713 					to_user_pointer(load_balance);
1714 
1715 				load_balance->base.name =
1716 					I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE;
1717 				load_balance->num_siblings =
1718 					ctx->engine_map_count;
1719 
1720 				for (j = 0; j < ctx->engine_map_count; j++)
1721 					load_balance->engines[j] =
1722 						get_engine(ctx->engine_map[j]);
1723 			}
1724 
1725 			/* Reserve slot for virtual engine. */
1726 			set_engines->engines[0].engine_class =
1727 				I915_ENGINE_CLASS_INVALID;
1728 			set_engines->engines[0].engine_instance =
1729 				I915_ENGINE_CLASS_INVALID_NONE;
1730 
1731 			for (j = 1; j <= ctx->engine_map_count; j++)
1732 				set_engines->engines[j] =
1733 					get_engine(ctx->engine_map[j - 1]);
1734 
1735 			last = NULL;
1736 			for (j = 0; j < ctx->bond_count; j++) {
1737 				unsigned long mask = ctx->bonds[j].mask;
1738 				struct i915_context_engines_bond *bond =
1739 					alloca0(sizeof_engines_bond(__builtin_popcount(mask)));
1740 				unsigned int b, e;
1741 
1742 				bond->base.next_extension = to_user_pointer(last);
1743 				bond->base.name = I915_CONTEXT_ENGINES_EXT_BOND;
1744 
1745 				bond->virtual_index = 0;
1746 				bond->master = get_engine(ctx->bonds[j].master);
1747 
1748 				for (b = 0, e = 0; mask; e++, mask >>= 1) {
1749 					unsigned int idx;
1750 
1751 					if (!(mask & 1))
1752 						continue;
1753 
1754 					idx = find_engine(&set_engines->engines[1],
1755 							  ctx->engine_map_count,
1756 							  e);
1757 					bond->engines[b++] =
1758 						set_engines->engines[1 + idx];
1759 				}
1760 
1761 				last = bond;
1762 			}
1763 			load_balance->base.next_extension = to_user_pointer(last);
1764 
1765 			gem_context_set_param(fd, &param);
1766 		} else if (ctx->wants_balance) {
1767 			const unsigned int count = num_engines_in_class(VCS);
1768 			struct i915_context_engines_load_balance *load_balance =
1769 				alloca0(sizeof_load_balance(count));
1770 			struct i915_context_param_engines *set_engines =
1771 				alloca0(sizeof_param_engines(count + 1));
1772 			struct drm_i915_gem_context_param param = {
1773 				.ctx_id = ctx_id,
1774 				.param = I915_CONTEXT_PARAM_ENGINES,
1775 				.size = sizeof_param_engines(count + 1),
1776 				.value = to_user_pointer(set_engines),
1777 			};
1778 
1779 			set_engines->extensions = to_user_pointer(load_balance);
1780 
1781 			set_engines->engines[0].engine_class =
1782 				I915_ENGINE_CLASS_INVALID;
1783 			set_engines->engines[0].engine_instance =
1784 				I915_ENGINE_CLASS_INVALID_NONE;
1785 			fill_engines_class(&set_engines->engines[1], VCS);
1786 
1787 			load_balance->base.name =
1788 				I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE;
1789 			load_balance->num_siblings = count;
1790 
1791 			fill_engines_class(&load_balance->engines[0], VCS);
1792 
1793 			gem_context_set_param(fd, &param);
1794 		}
1795 
1796 		if (wrk->sseu) {
1797 			/* Set to slice 0 only, one slice. */
1798 			ctx->sseu = set_ctx_sseu(ctx, 1);
1799 		}
1800 
1801 		if (share_vm)
1802 			vm_destroy(fd, share_vm);
1803 	}
1804 
1805 	/* Record default preemption. */
1806 	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
1807 		if (w->type == BATCH)
1808 			w->preempt_us = 100;
1809 	}
1810 
1811 	/*
1812 	 * Scan for contexts with modified preemption config and record their
1813 	 * preemption period for the following steps belonging to the same
1814 	 * context.
1815 	 */
1816 	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
1817 		struct w_step *w2;
1818 
1819 		if (w->type != PREEMPTION)
1820 			continue;
1821 
1822 		for (j = i + 1; j < wrk->nr_steps; j++) {
1823 			w2 = &wrk->steps[j];
1824 
1825 			if (w2->context != w->context)
1826 				continue;
1827 			else if (w2->type == PREEMPTION)
1828 				break;
1829 			else if (w2->type != BATCH)
1830 				continue;
1831 
1832 			w2->preempt_us = w->period;
1833 		}
1834 	}
1835 
1836 	/*
1837 	 * Scan for SSEU control steps.
1838 	 */
1839 	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
1840 		if (w->type == SSEU) {
1841 			get_device_sseu();
1842 			break;
1843 		}
1844 	}
1845 
1846 	/*
1847 	 * Allocate batch buffers.
1848 	 */
1849 	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
1850 		unsigned int _flags = flags;
1851 		enum intel_engine_id engine = w->engine;
1852 
1853 		if (w->type != BATCH)
1854 			continue;
1855 
1856 		if (engine == VCS)
1857 			_flags &= ~SWAPVCS;
1858 
1859 		alloc_step_batch(wrk, w, _flags);
1860 	}
1861 
1862 	return 0;
1863 }
1864 
1865 static double elapsed(const struct timespec *start, const struct timespec *end)
1866 {
1867 	return (end->tv_sec - start->tv_sec) +
1868 	       (end->tv_nsec - start->tv_nsec) / 1e9;
1869 }
1870 
1871 static int elapsed_us(const struct timespec *start, const struct timespec *end)
1872 {
1873 	return elapsed(start, end) * 1e6;
1874 }
1875 
1876 static enum intel_engine_id get_vcs_engine(unsigned int n)
1877 {
1878 	const enum intel_engine_id vcs_engines[2] = { VCS1, VCS2 };
1879 
1880 	igt_assert(n < ARRAY_SIZE(vcs_engines));
1881 
1882 	return vcs_engines[n];
1883 }
1884 
1885 static uint32_t new_seqno(struct workload *wrk, enum intel_engine_id engine)
1886 {
1887 	uint32_t seqno;
1888 	int ret;
1889 
1890 	if (wrk->flags & GLOBAL_BALANCE) {
1891 		igt_assert(wrk->global_wrk);
1892 		wrk = wrk->global_wrk;
1893 
1894 		ret = pthread_mutex_lock(&wrk->mutex);
1895 		igt_assert(ret == 0);
1896 	}
1897 
1898 	seqno = ++wrk->seqno[engine];
1899 
1900 	if (wrk->flags & GLOBAL_BALANCE) {
1901 		ret = pthread_mutex_unlock(&wrk->mutex);
1902 		igt_assert(ret == 0);
1903 	}
1904 
1905 	return seqno;
1906 }
1907 
1908 static uint32_t
1909 current_seqno(struct workload *wrk, enum intel_engine_id engine)
1910 {
1911 	if (wrk->flags & GLOBAL_BALANCE)
1912 		return wrk->global_wrk->seqno[engine];
1913 	else
1914 		return wrk->seqno[engine];
1915 }
1916 
1917 static uint32_t
1918 read_status_page(struct workload *wrk, unsigned int idx)
1919 {
1920 	if (wrk->flags & GLOBAL_BALANCE)
1921 		return READ_ONCE(wrk->global_wrk->status_page[idx]);
1922 	else
1923 		return READ_ONCE(wrk->status_page[idx]);
1924 }
1925 
1926 static uint32_t
1927 current_gpu_seqno(struct workload *wrk, enum intel_engine_id engine)
1928 {
1929 	return read_status_page(wrk, SEQNO_IDX(engine));
1930 }
1931 
1932 struct workload_balancer {
1933 	unsigned int id;
1934 	const char *name;
1935 	const char *desc;
1936 	unsigned int flags;
1937 	unsigned int min_gen;
1938 
1939 	int (*init)(const struct workload_balancer *balancer,
1940 		    struct workload *wrk);
1941 	unsigned int (*get_qd)(const struct workload_balancer *balancer,
1942 			       struct workload *wrk,
1943 			       enum intel_engine_id engine);
1944 	enum intel_engine_id (*balance)(const struct workload_balancer *balancer,
1945 					struct workload *wrk, struct w_step *w);
1946 };
1947 
1948 static enum intel_engine_id
1949 rr_balance(const struct workload_balancer *balancer,
1950 	   struct workload *wrk, struct w_step *w)
1951 {
1952 	unsigned int engine;
1953 
1954 	engine = get_vcs_engine(wrk->vcs_rr);
1955 	wrk->vcs_rr ^= 1;
1956 
1957 	return engine;
1958 }
1959 
1960 static enum intel_engine_id
1961 rand_balance(const struct workload_balancer *balancer,
1962 	     struct workload *wrk, struct w_step *w)
1963 {
1964 	return get_vcs_engine(hars_petruska_f54_1_random(&wrk->prng) & 1);
1965 }
1966 
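/* Queue depth = seqnos submitted by us minus seqnos the GPU has completed. */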
1967 static unsigned int
1968 get_qd_depth(const struct workload_balancer *balancer,
1969 	     struct workload *wrk, enum intel_engine_id engine)
1970 {
1971 	return current_seqno(wrk, engine) - current_gpu_seqno(wrk, engine);
1972 }
1973 
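/*
 * Pick the VCS engine with the shallower queue; on a tie fall back to either
 * a random choice or round-robin, depending on the balancer flavour.
 */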
1974 static enum intel_engine_id
1975 __qd_select_engine(struct workload *wrk, const unsigned long *qd, bool random)
1976 {
1977 	unsigned int n;
1978 
1979 	if (qd[VCS1] < qd[VCS2])
1980 		n = 0;
1981 	else if (qd[VCS1] > qd[VCS2])
1982 		n = 1;
1983 	else if (random)
1984 		n = hars_petruska_f54_1_random(&wrk->prng) & 1;
1985 	else
1986 		n = wrk->vcs_rr;
1987 	wrk->vcs_rr = n ^ 1;
1988 
1989 	return get_vcs_engine(n);
1990 }
1991 
1992 static enum intel_engine_id
1993 __qd_balance(const struct workload_balancer *balancer,
1994 	     struct workload *wrk, struct w_step *w, bool random)
1995 {
1996 	enum intel_engine_id engine;
1997 	unsigned long qd[NUM_ENGINES];
1998 
1999 	igt_assert(w->engine == VCS);
2000 
2001 	qd[VCS1] = balancer->get_qd(balancer, wrk, VCS1);
2002 	wrk->qd_sum[VCS1] += qd[VCS1];
2003 
2004 	qd[VCS2] = balancer->get_qd(balancer, wrk, VCS2);
2005 	wrk->qd_sum[VCS2] += qd[VCS2];
2006 
2007 	engine = __qd_select_engine(wrk, qd, random);
2008 
2009 #ifdef DEBUG
2010 	printf("qd_balance[%u]: 1:%ld 2:%ld rr:%u = %u\t(%u - %u) (%u - %u)\n",
2011 	       wrk->id, qd[VCS1], qd[VCS2], wrk->vcs_rr, engine,
2012 	       current_seqno(wrk, VCS1), current_gpu_seqno(wrk, VCS1),
2013 	       current_seqno(wrk, VCS2), current_gpu_seqno(wrk, VCS2));
2014 #endif
2015 	return engine;
2016 }
2017 
2018 static enum intel_engine_id
2019 qd_balance(const struct workload_balancer *balancer,
2020 	     struct workload *wrk, struct w_step *w)
2021 {
2022 	return __qd_balance(balancer, wrk, w, false);
2023 }
2024 
2025 static enum intel_engine_id
2026 qdr_balance(const struct workload_balancer *balancer,
2027 	     struct workload *wrk, struct w_step *w)
2028 {
2029 	return __qd_balance(balancer, wrk, w, true);
2030 }
2031 
2032 static enum intel_engine_id
2033 qdavg_balance(const struct workload_balancer *balancer,
2034 	     struct workload *wrk, struct w_step *w)
2035 {
2036 	unsigned long qd[NUM_ENGINES];
2037 	unsigned int engine;
2038 
2039 	igt_assert(w->engine == VCS);
2040 
2041 	for (engine = VCS1; engine <= VCS2; engine++) {
2042 		qd[engine] = balancer->get_qd(balancer, wrk, engine);
2043 		wrk->qd_sum[engine] += qd[engine];
2044 
2045 		ewma_rt_add(&wrk->rt.avg[engine], qd[engine]);
2046 		qd[engine] = ewma_rt_read(&wrk->rt.avg[engine]);
2047 	}
2048 
2049 	engine = __qd_select_engine(wrk, qd, false);
2050 #ifdef DEBUG
2051 	printf("qdavg_balance[%u]: 1:%ld 2:%ld rr:%u = %u\t(%u - %u) (%u - %u)\n",
2052 	       wrk->id, qd[VCS1], qd[VCS2], wrk->vcs_rr, engine,
2053 	       current_seqno(wrk, VCS1), current_gpu_seqno(wrk, VCS1),
2054 	       current_seqno(wrk, VCS2), current_gpu_seqno(wrk, VCS2));
2055 #endif
2056 	return engine;
2057 }
2058 
2059 static enum intel_engine_id
2060 __rt_select_engine(struct workload *wrk, unsigned long *qd, bool random)
2061 {
2062 	qd[VCS1] >>= 10;
2063 	qd[VCS2] >>= 10;
2064 
2065 	return __qd_select_engine(wrk, qd, random);
2066 }
2067 
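/*
 * The runtime ("rt") balancers read a per-engine snapshot from the status
 * page. Judging by init_status_page() and the update_bb_*() helpers, the
 * per-engine layout is: [0] last completed seqno, [1] submit timestamp,
 * [2] completion timestamp, [3] a latch copy of the seqno. get_rt_depth()
 * below re-reads the block until latch == seqno, seqlock-style, so all
 * three values belong to the same batch.
 */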
2068 struct rt_depth {
2069 	uint32_t seqno;
2070 	uint32_t submitted;
2071 	uint32_t completed;
2072 };
2073 
2074 static void get_rt_depth(struct workload *wrk,
2075 			 unsigned int engine,
2076 			 struct rt_depth *rt)
2077 {
2078 	const unsigned int idx = SEQNO_IDX(engine);
2079 	uint32_t latch;
2080 
2081 	do {
2082 		latch = read_status_page(wrk, idx + 3);
2083 		rt->submitted = read_status_page(wrk, idx + 1);
2084 		rt->completed = read_status_page(wrk, idx + 2);
2085 		rt->seqno = read_status_page(wrk, idx);
2086 	} while (latch != rt->seqno);
2087 }
2088 
2089 static enum intel_engine_id
2090 __rt_balance(const struct workload_balancer *balancer,
2091 	     struct workload *wrk, struct w_step *w, bool random)
2092 {
2093 	unsigned long qd[NUM_ENGINES];
2094 	unsigned int engine;
2095 
2096 	igt_assert(w->engine == VCS);
2097 
2098 	/* Estimate the "speed" of the most recent batch
2099 	 *    (finish time - submit time)
2100 	 * and use that as an approximation of the total remaining time for
2101 	 * all batches on that engine, plus the time we expect this batch to
2102 	 * take. We try to keep the total balanced between the engines.
2103 	 */
2104 	for (engine = VCS1; engine <= VCS2; engine++) {
2105 		struct rt_depth rt;
2106 
2107 		get_rt_depth(wrk, engine, &rt);
2108 		qd[engine] = current_seqno(wrk, engine) - rt.seqno;
2109 		wrk->qd_sum[engine] += qd[engine];
2110 		qd[engine] = (qd[engine] + 1) * (rt.completed - rt.submitted);
2111 #ifdef DEBUG
2112 		printf("rt[0] = %d (%d - %d) x %d (%d - %d) = %ld\n",
2113 		       current_seqno(wrk, engine) - rt.seqno,
2114 		       current_seqno(wrk, engine), rt.seqno,
2115 		       rt.completed - rt.submitted,
2116 		       rt.completed, rt.submitted,
2117 		       qd[engine]);
2118 #endif
2119 	}
2120 
2121 	return __rt_select_engine(wrk, qd, random);
2122 }
2123 
2124 static enum intel_engine_id
2125 rt_balance(const struct workload_balancer *balancer,
2126 	   struct workload *wrk, struct w_step *w)
2127 {
2128 
2129 	return __rt_balance(balancer, wrk, w, false);
2130 }
2131 
2132 static enum intel_engine_id
2133 rtr_balance(const struct workload_balancer *balancer,
2134 	   struct workload *wrk, struct w_step *w)
2135 {
2136 	return __rt_balance(balancer, wrk, w, true);
2137 }
2138 
2139 static enum intel_engine_id
2140 rtavg_balance(const struct workload_balancer *balancer,
2141 	   struct workload *wrk, struct w_step *w)
2142 {
2143 	unsigned long qd[NUM_ENGINES];
2144 	unsigned int engine;
2145 
2146 	igt_assert(w->engine == VCS);
2147 
2148 	/* Estimate the average "speed" of the most recent batches
2149 	 *    (finish time - submit time)
2150 	 * and use that as an approximation of the total remaining time for
2151 	 * all batches on that engine, plus the time we expect this batch to take.
2152 	 * We try to keep the total remaining balanced between the engines.
2153 	 */
2154 	for (engine = VCS1; engine <= VCS2; engine++) {
2155 		struct rt_depth rt;
2156 
2157 		get_rt_depth(wrk, engine, &rt);
2158 		if (rt.seqno != wrk->rt.last[engine]) {
2159 			igt_assert((long)(rt.completed - rt.submitted) > 0);
2160 			ewma_rt_add(&wrk->rt.avg[engine],
2161 				    rt.completed - rt.submitted);
2162 			wrk->rt.last[engine] = rt.seqno;
2163 		}
2164 		qd[engine] = current_seqno(wrk, engine) - rt.seqno;
2165 		wrk->qd_sum[engine] += qd[engine];
2166 		qd[engine] =
2167 			(qd[engine] + 1) * ewma_rt_read(&wrk->rt.avg[engine]);
2168 
2169 #ifdef DEBUG
2170 		printf("rtavg[%d] = %d (%d - %d) x %ld (%d) = %ld\n",
2171 		       engine,
2172 		       current_seqno(wrk, engine) - rt.seqno,
2173 		       current_seqno(wrk, engine), rt.seqno,
2174 		       ewma_rt_read(&wrk->rt.avg[engine]),
2175 		       rt.completed - rt.submitted,
2176 		       qd[engine]);
2177 #endif
2178 	}
2179 
2180 	return __rt_select_engine(wrk, qd, false);
2181 }
2182 
2183 static enum intel_engine_id
2184 context_balance(const struct workload_balancer *balancer,
2185 		struct workload *wrk, struct w_step *w)
2186 {
2187 	return get_vcs_engine(__get_ctx(wrk, w)->static_vcs);
2188 }
2189 
2190 static unsigned int
2191 get_engine_busy(const struct workload_balancer *balancer,
2192 		struct workload *wrk, enum intel_engine_id engine)
2193 {
2194 	struct busy_balancer *bb = &wrk->busy_balancer;
2195 
2196 	if (engine == VCS2 && (wrk->flags & VCS2REMAP))
2197 		engine = BCS;
2198 
2199 	return bb->busy[bb->engine_map[engine]];
2200 }
2201 
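/*
 * Read all engine busyness counters in one go from the perf group fd opened
 * in busy_init(). The buffer appears to be laid out as { nr_counters,
 * enabled time, counter values... }, hence val[1] is used as the time base
 * and val[2 + i] as the per-engine busy value; busyness is reported as the
 * percentage delta since the previous sample.
 */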
2202 static void
2203 get_pmu_stats(const struct workload_balancer *b, struct workload *wrk)
2204 {
2205 	struct busy_balancer *bb = &wrk->busy_balancer;
2206 	uint64_t val[7];
2207 	unsigned int i;
2208 
2209 	igt_assert_eq(read(bb->fd, val, sizeof(val)),
2210 		      (2 + bb->num_engines) * sizeof(uint64_t));
2211 
2212 	if (!bb->first) {
2213 		for (i = 0; i < bb->num_engines; i++) {
2214 			double d;
2215 
2216 			d = (val[2 + i] - bb->prev[i]) * 100;
2217 			d /= val[1] - bb->t_prev;
2218 			bb->busy[i] = d;
2219 		}
2220 	}
2221 
2222 	for (i = 0; i < bb->num_engines; i++)
2223 		bb->prev[i] = val[2 + i];
2224 
2225 	bb->t_prev = val[1];
2226 	bb->first = false;
2227 }
2228 
2229 static enum intel_engine_id
2230 busy_avg_balance(const struct workload_balancer *balancer,
2231 		 struct workload *wrk, struct w_step *w)
2232 {
2233 	get_pmu_stats(balancer, wrk);
2234 
2235 	return qdavg_balance(balancer, wrk, w);
2236 }
2237 
2238 static enum intel_engine_id
2239 busy_balance(const struct workload_balancer *balancer,
2240 	     struct workload *wrk, struct w_step *w)
2241 {
2242 	get_pmu_stats(balancer, wrk);
2243 
2244 	return qd_balance(balancer, wrk, w);
2245 }
2246 
2247 static int
2248 busy_init(const struct workload_balancer *balancer, struct workload *wrk)
2249 {
2250 	struct busy_balancer *bb = &wrk->busy_balancer;
2251 	struct engine_desc {
2252 		unsigned class, inst;
2253 		enum intel_engine_id id;
2254 	} *d, engines[] = {
2255 		{ I915_ENGINE_CLASS_RENDER, 0, RCS },
2256 		{ I915_ENGINE_CLASS_COPY, 0, BCS },
2257 		{ I915_ENGINE_CLASS_VIDEO, 0, VCS1 },
2258 		{ I915_ENGINE_CLASS_VIDEO, 1, VCS2 },
2259 		{ I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, VECS },
2260 		{ 0, 0, VCS }
2261 	};
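	/* The plain { 0, 0, VCS } entry terminates the engine table above. */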
2262 
2263 	bb->num_engines = 0;
2264 	bb->first = true;
2265 	bb->fd = -1;
2266 
2267 	for (d = &engines[0]; d->id != VCS; d++) {
2268 		int pfd;
2269 
2270 		pfd = perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class,
2271 							        d->inst),
2272 					   bb->fd);
2273 		if (pfd < 0) {
2274 			if (d->id != VCS2)
2275 				return -(10 + bb->num_engines);
2276 			else
2277 				continue;
2278 		}
2279 
2280 		if (bb->num_engines == 0)
2281 			bb->fd = pfd;
2282 
2283 		bb->engine_map[d->id] = bb->num_engines++;
2284 	}
2285 
2286 	if (bb->num_engines < 5 && !(wrk->flags & VCS2REMAP))
2287 		return -1;
2288 
2289 	return 0;
2290 }
2291 
2292 static const struct workload_balancer all_balancers[] = {
2293 	{
2294 		.id = 0,
2295 		.name = "rr",
2296 		.desc = "Simple round-robin.",
2297 		.balance = rr_balance,
2298 	},
2299 	{
2300 		.id = 6,
2301 		.name = "rand",
2302 		.desc = "Random selection.",
2303 		.balance = rand_balance,
2304 	},
2305 	{
2306 		.id = 1,
2307 		.name = "qd",
2308 		.desc = "Queue depth estimation with round-robin on equal depth.",
2309 		.flags = SEQNO,
2310 		.min_gen = 8,
2311 		.get_qd = get_qd_depth,
2312 		.balance = qd_balance,
2313 	},
2314 	{
2315 		.id = 5,
2316 		.name = "qdr",
2317 		.desc = "Queue depth estimation with random selection on equal depth.",
2318 		.flags = SEQNO,
2319 		.min_gen = 8,
2320 		.get_qd = get_qd_depth,
2321 		.balance = qdr_balance,
2322 	},
2323 	{
2324 		.id = 7,
2325 		.name = "qdavg",
2326 		.desc = "Like qd, but using an average queue depth estimator.",
2327 		.flags = SEQNO,
2328 		.min_gen = 8,
2329 		.get_qd = get_qd_depth,
2330 		.balance = qdavg_balance,
2331 	},
2332 	{
2333 		.id = 2,
2334 		.name = "rt",
2335 		.desc = "Queue depth plus last runtime estimation.",
2336 		.flags = SEQNO | RT,
2337 		.min_gen = 8,
2338 		.get_qd = get_qd_depth,
2339 		.balance = rt_balance,
2340 	},
2341 	{
2342 		.id = 3,
2343 		.name = "rtr",
2344 		.desc = "Like rt but with random engine selection on equal depth.",
2345 		.flags = SEQNO | RT,
2346 		.min_gen = 8,
2347 		.get_qd = get_qd_depth,
2348 		.balance = rtr_balance,
2349 	},
2350 	{
2351 		.id = 4,
2352 		.name = "rtavg",
2353 		.desc = "Improved version of rt, tracking average execution speed per engine.",
2354 		.flags = SEQNO | RT,
2355 		.min_gen = 8,
2356 		.get_qd = get_qd_depth,
2357 		.balance = rtavg_balance,
2358 	},
2359 	{
2360 		.id = 8,
2361 		.name = "context",
2362 		.desc = "Static round-robin VCS assignment at context creation.",
2363 		.balance = context_balance,
2364 	},
2365 	{
2366 		.id = 9,
2367 		.name = "busy",
2368 		.desc = "Engine busyness based balancing.",
2369 		.init = busy_init,
2370 		.get_qd = get_engine_busy,
2371 		.balance = busy_balance,
2372 	},
2373 	{
2374 		.id = 10,
2375 		.name = "busy-avg",
2376 		.desc = "Average engine busyness based balancing.",
2377 		.init = busy_init,
2378 		.get_qd = get_engine_busy,
2379 		.balance = busy_avg_balance,
2380 	},
2381 	{
2382 		.id = 11,
2383 		.name = "i915",
2384 		.desc = "i915 balancing.",
2385 		.flags = I915,
2386 	},
2387 };
2388 
2389 static unsigned int
2390 global_get_qd(const struct workload_balancer *balancer,
2391 	      struct workload *wrk, enum intel_engine_id engine)
2392 {
2393 	igt_assert(wrk->global_wrk);
2394 	igt_assert(wrk->global_balancer);
2395 
2396 	return wrk->global_balancer->get_qd(wrk->global_balancer,
2397 					    wrk->global_wrk, engine);
2398 }
2399 
2400 static enum intel_engine_id
2401 global_balance(const struct workload_balancer *balancer,
2402 	       struct workload *wrk, struct w_step *w)
2403 {
2404 	enum intel_engine_id engine;
2405 	int ret;
2406 
2407 	igt_assert(wrk->global_wrk);
2408 	igt_assert(wrk->global_balancer);
2409 
2410 	wrk = wrk->global_wrk;
2411 
2412 	ret = pthread_mutex_lock(&wrk->mutex);
2413 	igt_assert(ret == 0);
2414 
2415 	engine = wrk->global_balancer->balance(wrk->global_balancer, wrk, w);
2416 
2417 	ret = pthread_mutex_unlock(&wrk->mutex);
2418 	igt_assert(ret == 0);
2419 
2420 	return engine;
2421 }
2422 
2423 static const struct workload_balancer global_balancer = {
2424 		.id = ~0,
2425 		.name = "global",
2426 		.desc = "Global balancer",
2427 		.get_qd = global_get_qd,
2428 		.balance = global_balance,
2429 	};
2430 
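/*
 * The seqno/rt balancer modes need each emitted batch to write its own
 * bookkeeping into the status page. The helpers below patch the batch
 * payload before every execbuf: the seqno to store, the CPU-sampled submit
 * timestamp, and the target addresses derived from the relocation entries'
 * last known (presumed) offsets.
 */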
2431 static void
2432 update_bb_seqno(struct w_step *w, enum intel_engine_id engine, uint32_t seqno)
2433 {
2434 	gem_set_domain(fd, w->bb_handle,
2435 		       I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
2436 
2437 	w->reloc[0].delta = SEQNO_OFFSET(engine);
2438 
2439 	*w->seqno_value = seqno;
2440 	*w->seqno_address = w->reloc[0].presumed_offset + w->reloc[0].delta;
2441 
2442 	/* If not using NO_RELOC, force the relocations */
2443 	if (!(w->eb.flags & I915_EXEC_NO_RELOC))
2444 		w->reloc[0].presumed_offset = -1;
2445 }
2446 
2447 static void
2448 update_bb_rt(struct w_step *w, enum intel_engine_id engine, uint32_t seqno)
2449 {
2450 	gem_set_domain(fd, w->bb_handle,
2451 		       I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
2452 
2453 	w->reloc[1].delta = SEQNO_OFFSET(engine) + sizeof(uint32_t);
2454 	w->reloc[2].delta = SEQNO_OFFSET(engine) + 2 * sizeof(uint32_t);
2455 	w->reloc[3].delta = SEQNO_OFFSET(engine) + 3 * sizeof(uint32_t);
2456 
2457 	*w->latch_value = seqno;
2458 	*w->latch_address = w->reloc[3].presumed_offset + w->reloc[3].delta;
2459 
2460 	*w->rt0_value = *REG(RCS_TIMESTAMP);
2461 	*w->rt0_address = w->reloc[1].presumed_offset + w->reloc[1].delta;
2462 	*w->rt1_address = w->reloc[2].presumed_offset + w->reloc[2].delta;
2463 
2464 	/* If not using NO_RELOC, force the relocations */
2465 	if (!(w->eb.flags & I915_EXEC_NO_RELOC)) {
2466 		w->reloc[1].presumed_offset = -1;
2467 		w->reloc[2].presumed_offset = -1;
2468 		w->reloc[3].presumed_offset = -1;
2469 	}
2470 }
2471 
2472 static void
2473 update_bb_start(struct w_step *w)
2474 {
2475 	if (!w->unbound_duration)
2476 		return;
2477 
2478 	gem_set_domain(fd, w->bb_handle,
2479 		       I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
2480 
2481 	*w->recursive_bb_start = MI_BATCH_BUFFER_START | (1 << 8) | 1;
2482 }
2483 
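/*
 * Used by THROTTLE steps: block until the batch submitted 'target' steps
 * ago has completed, walking backwards past non-batch steps (and wrapping
 * around the step list) to find it.
 */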
2484 static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
2485 {
2486 	if (target < 0)
2487 		target = wrk->nr_steps + target;
2488 
2489 	igt_assert(target < wrk->nr_steps);
2490 
2491 	while (wrk->steps[target].type != BATCH) {
2492 		if (--target < 0)
2493 			target = wrk->nr_steps + target;
2494 	}
2495 
2496 	igt_assert(target < wrk->nr_steps);
2497 	igt_assert(wrk->steps[target].type == BATCH);
2498 
2499 	gem_sync(fd, wrk->steps[target].obj[0].handle);
2500 }
2501 
2502 static uint32_t *get_status_cs(struct workload *wrk)
2503 {
2504 	return wrk->status_cs;
2505 }
2506 
2507 #define INIT_CLOCKS 0x1
2508 #define INIT_ALL (INIT_CLOCKS)
2509 static void init_status_page(struct workload *wrk, unsigned int flags)
2510 {
2511 	struct drm_i915_gem_relocation_entry reloc[4] = {};
2512 	struct drm_i915_gem_exec_object2 *status_object =
2513 						get_status_objects(wrk);
2514 	struct drm_i915_gem_execbuffer2 eb = {
2515 		.buffer_count = ARRAY_SIZE(wrk->status_object),
2516 		.buffers_ptr = to_user_pointer(status_object)
2517 	};
2518 	uint32_t *base = get_status_cs(wrk);
2519 
2520 	/* Want to make sure that the balancer has a reasonable view of
2521 	 * the background busyness of each engine. To do that we occasionally
2522 	 * send a dummy batch down the pipeline.
2523 	 */
2524 
2525 	if (!base)
2526 		return;
2527 
2528 	gem_set_domain(fd, status_object[1].handle,
2529 		       I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
2530 
2531 	status_object[1].relocs_ptr = to_user_pointer(reloc);
2532 	status_object[1].relocation_count = 2;
2533 	if (flags & INIT_CLOCKS)
2534 		status_object[1].relocation_count += 2;
2535 
2536 	for (int engine = 0; engine < NUM_ENGINES; engine++) {
2537 		struct drm_i915_gem_relocation_entry *r = reloc;
2538 		uint64_t presumed_offset = status_object[0].offset;
2539 		uint32_t offset = engine * 128;
2540 		uint32_t *cs = base + offset / sizeof(*cs);
2541 		uint64_t addr;
2542 
2543 		r->offset = offset + sizeof(uint32_t);
2544 		r->delta = SEQNO_OFFSET(engine);
2545 		r->presumed_offset = presumed_offset;
2546 		addr = presumed_offset + r->delta;
2547 		r++;
2548 		*cs++ = MI_STORE_DWORD_IMM;
2549 		*cs++ = addr;
2550 		*cs++ = addr >> 32;
2551 		*cs++ = new_seqno(wrk, engine);
2552 		offset += 4 * sizeof(uint32_t);
2553 
2554 		/* When we are busy, we can just reuse the last set of timings.
2555 		 * If we have been idle for a while, we want to resample the
2556 		 * latency on each engine (to measure external load).
2557 		 */
2558 		if (flags & INIT_CLOCKS) {
2559 			r->offset = offset + sizeof(uint32_t);
2560 			r->delta = SEQNO_OFFSET(engine) + sizeof(uint32_t);
2561 			r->presumed_offset = presumed_offset;
2562 			addr = presumed_offset + r->delta;
2563 			r++;
2564 			*cs++ = MI_STORE_DWORD_IMM;
2565 			*cs++ = addr;
2566 			*cs++ = addr >> 32;
2567 			*cs++ = *REG(RCS_TIMESTAMP);
2568 			offset += 4 * sizeof(uint32_t);
2569 
2570 			r->offset = offset + 2 * sizeof(uint32_t);
2571 			r->delta = SEQNO_OFFSET(engine) + 2*sizeof(uint32_t);
2572 			r->presumed_offset = presumed_offset;
2573 			addr = presumed_offset + r->delta;
2574 			r++;
2575 			*cs++ = 0x24 << 23 | 2; /* MI_STORE_REG_MEM */
2576 			*cs++ = RCS_TIMESTAMP;
2577 			*cs++ = addr;
2578 			*cs++ = addr >> 32;
2579 			offset += 4 * sizeof(uint32_t);
2580 		}
2581 
2582 		r->offset = offset + sizeof(uint32_t);
2583 		r->delta = SEQNO_OFFSET(engine) + 3*sizeof(uint32_t);
2584 		r->presumed_offset = presumed_offset;
2585 		addr = presumed_offset + r->delta;
2586 		r++;
2587 		*cs++ = MI_STORE_DWORD_IMM;
2588 		*cs++ = addr;
2589 		*cs++ = addr >> 32;
2590 		*cs++ = current_seqno(wrk, engine);
2591 		offset += 4 * sizeof(uint32_t);
2592 
2593 		*cs++ = MI_BATCH_BUFFER_END;
2594 
2595 		eb_set_engine(&eb, engine, wrk->flags);
2596 		eb.flags |= I915_EXEC_HANDLE_LUT;
2597 		eb.flags |= I915_EXEC_NO_RELOC;
2598 
2599 		eb.batch_start_offset = 128 * engine;
2600 
2601 		gem_execbuf(fd, &eb);
2602 	}
2603 }
2604 
2605 static void
2606 do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine,
2607       unsigned int flags)
2608 {
2609 	uint32_t seqno = new_seqno(wrk, engine);
2610 	unsigned int i;
2611 
2612 	eb_update_flags(wrk, w, engine, flags);
2613 
2614 	if (flags & SEQNO)
2615 		update_bb_seqno(w, engine, seqno);
2616 	if (flags & RT)
2617 		update_bb_rt(w, engine, seqno);
2618 
2619 	update_bb_start(w);
2620 
2621 	w->eb.batch_start_offset =
2622 		w->unbound_duration ?
2623 		0 :
2624 		ALIGN(w->bb_sz - get_bb_sz(get_duration(wrk, w)),
2625 		      2 * sizeof(uint32_t));
2626 
2627 	for (i = 0; i < w->fence_deps.nr; i++) {
2628 		int tgt = w->idx + w->fence_deps.list[i];
2629 
2630 		/* TODO: fence merging needed to support multiple inputs */
2631 		igt_assert(i == 0);
2632 		igt_assert(tgt >= 0 && tgt < w->idx);
2633 		igt_assert(wrk->steps[tgt].emit_fence > 0);
2634 
2635 		if (w->fence_deps.submit_fence)
2636 			w->eb.flags |= I915_EXEC_FENCE_SUBMIT;
2637 		else
2638 			w->eb.flags |= I915_EXEC_FENCE_IN;
2639 
2640 		w->eb.rsvd2 = wrk->steps[tgt].emit_fence;
2641 	}
2642 
2643 	if (w->eb.flags & I915_EXEC_FENCE_OUT)
2644 		gem_execbuf_wr(fd, &w->eb);
2645 	else
2646 		gem_execbuf(fd, &w->eb);
2647 
2648 	if (w->eb.flags & I915_EXEC_FENCE_OUT) {
2649 		w->emit_fence = w->eb.rsvd2 >> 32;
2650 		igt_assert(w->emit_fence > 0);
2651 	}
2652 }
2653 
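/*
 * With -d (DEPSYNC) a VCS batch waits on the CPU for each of its data
 * dependencies (negative relative indices into earlier BATCH steps) before
 * being submitted. The return value records that a sync happened, which
 * with -H triggers a heartbeat resample of the status page.
 */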
2654 static bool sync_deps(struct workload *wrk, struct w_step *w)
2655 {
2656 	bool synced = false;
2657 	unsigned int i;
2658 
2659 	for (i = 0; i < w->data_deps.nr; i++) {
2660 		int dep_idx;
2661 
2662 		igt_assert(w->data_deps.list[i] <= 0);
2663 
2664 		if (!w->data_deps.list[i])
2665 			continue;
2666 
2667 		dep_idx = w->idx + w->data_deps.list[i];
2668 
2669 		igt_assert(dep_idx >= 0 && dep_idx < w->idx);
2670 		igt_assert(wrk->steps[dep_idx].type == BATCH);
2671 
2672 		gem_sync(fd, wrk->steps[dep_idx].obj[0].handle);
2673 
2674 		synced = true;
2675 	}
2676 
2677 	return synced;
2678 }
2679 
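/*
 * Per-client thread body. Each repetition walks the parsed steps in order:
 * non-batch steps (delays, periods, fences, throttles, priority/SSEU
 * changes, ...) are handled inline, BATCH steps are optionally balanced
 * onto a VCS engine and submitted, and a per-engine list of in-flight
 * requests is kept so QD_THROTTLE can cap the queue depth by syncing on
 * the oldest outstanding request.
 */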
2680 static void *run_workload(void *data)
2681 {
2682 	struct workload *wrk = (struct workload *)data;
2683 	struct timespec t_start, t_end;
2684 	struct w_step *w;
2685 	bool last_sync = false;
2686 	int throttle = -1;
2687 	int qd_throttle = -1;
2688 	int count;
2689 	int i;
2690 
2691 	clock_gettime(CLOCK_MONOTONIC, &t_start);
2692 
2693 	init_status_page(wrk, INIT_ALL);
2694 	for (count = 0; wrk->run && (wrk->background || count < wrk->repeat);
2695 	     count++) {
2696 		unsigned int cur_seqno = wrk->sync_seqno;
2697 
2698 		clock_gettime(CLOCK_MONOTONIC, &wrk->repeat_start);
2699 
2700 		for (i = 0, w = wrk->steps; wrk->run && (i < wrk->nr_steps);
2701 		     i++, w++) {
2702 			enum intel_engine_id engine = w->engine;
2703 			int do_sleep = 0;
2704 
2705 			if (w->type == DELAY) {
2706 				do_sleep = w->delay;
2707 			} else if (w->type == PERIOD) {
2708 				struct timespec now;
2709 
2710 				clock_gettime(CLOCK_MONOTONIC, &now);
2711 				do_sleep = w->period -
2712 					   elapsed_us(&wrk->repeat_start, &now);
2713 				if (do_sleep < 0) {
2714 					if (verbose > 1)
2715 						printf("%u: Dropped period @ %u/%u (%dus late)!\n",
2716 						       wrk->id, count, i, do_sleep);
2717 					continue;
2718 				}
2719 			} else if (w->type == SYNC) {
2720 				unsigned int s_idx = i + w->target;
2721 
2722 				igt_assert(s_idx >= 0 && s_idx < i);
2723 				igt_assert(wrk->steps[s_idx].type == BATCH);
2724 				gem_sync(fd, wrk->steps[s_idx].obj[0].handle);
2725 				continue;
2726 			} else if (w->type == THROTTLE) {
2727 				throttle = w->throttle;
2728 				continue;
2729 			} else if (w->type == QD_THROTTLE) {
2730 				qd_throttle = w->throttle;
2731 				continue;
2732 			} else if (w->type == SW_FENCE) {
2733 				igt_assert(w->emit_fence < 0);
2734 				w->emit_fence =
2735 					sw_sync_timeline_create_fence(wrk->sync_timeline,
2736 								      cur_seqno + w->idx);
2737 				igt_assert(w->emit_fence > 0);
2738 				continue;
2739 			} else if (w->type == SW_FENCE_SIGNAL) {
2740 				int tgt = w->idx + w->target;
2741 				int inc;
2742 
2743 				igt_assert(tgt >= 0 && tgt < i);
2744 				igt_assert(wrk->steps[tgt].type == SW_FENCE);
2745 				cur_seqno += wrk->steps[tgt].idx;
2746 				inc = cur_seqno - wrk->sync_seqno;
2747 				sw_sync_timeline_inc(wrk->sync_timeline, inc);
2748 				continue;
2749 			} else if (w->type == CTX_PRIORITY) {
2750 				if (w->priority != wrk->ctx_list[w->context].priority) {
2751 					struct drm_i915_gem_context_param param = {
2752 						.ctx_id = wrk->ctx_list[w->context].id,
2753 						.param = I915_CONTEXT_PARAM_PRIORITY,
2754 						.value = w->priority,
2755 					};
2756 
2757 					gem_context_set_param(fd, &param);
2758 					wrk->ctx_list[w->context].priority =
2759 								    w->priority;
2760 				}
2761 				continue;
2762 			} else if (w->type == TERMINATE) {
2763 				unsigned int t_idx = i + w->target;
2764 
2765 				igt_assert(t_idx >= 0 && t_idx < i);
2766 				igt_assert(wrk->steps[t_idx].type == BATCH);
2767 				igt_assert(wrk->steps[t_idx].unbound_duration);
2768 
2769 				*wrk->steps[t_idx].recursive_bb_start =
2770 					MI_BATCH_BUFFER_END;
2771 				__sync_synchronize();
2772 				continue;
2773 			} else if (w->type == PREEMPTION ||
2774 				   w->type == ENGINE_MAP ||
2775 				   w->type == LOAD_BALANCE ||
2776 				   w->type == BOND) {
2777 				continue;
2778 			} else if (w->type == SSEU) {
2779 				if (w->sseu != wrk->ctx_list[w->context * 2].sseu) {
2780 					wrk->ctx_list[w->context * 2].sseu =
2781 						set_ctx_sseu(&wrk->ctx_list[w->context * 2],
2782 							     w->sseu);
2783 				}
2784 				continue;
2785 			}
2786 
2787 			if (do_sleep || w->type == PERIOD) {
2788 				usleep(do_sleep);
2789 				continue;
2790 			}
2791 
2792 			igt_assert(w->type == BATCH);
2793 
2794 			if ((wrk->flags & DEPSYNC) && engine == VCS)
2795 				last_sync = sync_deps(wrk, w);
2796 
2797 			if (last_sync && (wrk->flags & HEARTBEAT))
2798 				init_status_page(wrk, 0);
2799 
2800 			last_sync = false;
2801 
2802 			wrk->nr_bb[engine]++;
2803 			if (engine == VCS && wrk->balancer &&
2804 			    wrk->balancer->balance) {
2805 				engine = wrk->balancer->balance(wrk->balancer,
2806 								wrk, w);
2807 				wrk->nr_bb[engine]++;
2808 			}
2809 
2810 			if (throttle > 0)
2811 				w_sync_to(wrk, w, i - throttle);
2812 
2813 			do_eb(wrk, w, engine, wrk->flags);
2814 
2815 			if (w->request != -1) {
2816 				igt_list_del(&w->rq_link);
2817 				wrk->nrequest[w->request]--;
2818 			}
2819 			w->request = engine;
2820 			igt_list_add_tail(&w->rq_link, &wrk->requests[engine]);
2821 			wrk->nrequest[engine]++;
2822 
2823 			if (!wrk->run)
2824 				break;
2825 
2826 			if (w->sync) {
2827 				gem_sync(fd, w->obj[0].handle);
2828 				last_sync = true;
2829 			}
2830 
2831 			if (qd_throttle > 0) {
2832 				while (wrk->nrequest[engine] > qd_throttle) {
2833 					struct w_step *s;
2834 
2835 					s = igt_list_first_entry(&wrk->requests[engine],
2836 								 s, rq_link);
2837 
2838 					gem_sync(fd, s->obj[0].handle);
2839 					last_sync = true;
2840 
2841 					s->request = -1;
2842 					igt_list_del(&s->rq_link);
2843 					wrk->nrequest[engine]--;
2844 				}
2845 			}
2846 		}
2847 
2848 		if (wrk->sync_timeline) {
2849 			int inc;
2850 
2851 			inc = wrk->nr_steps - (cur_seqno - wrk->sync_seqno);
2852 			sw_sync_timeline_inc(wrk->sync_timeline, inc);
2853 			wrk->sync_seqno += wrk->nr_steps;
2854 		}
2855 
2856 		/* Cleanup all fences instantiated in this iteration. */
2857 		for (i = 0, w = wrk->steps; wrk->run && (i < wrk->nr_steps);
2858 		     i++, w++) {
2859 			if (w->emit_fence > 0) {
2860 				close(w->emit_fence);
2861 				w->emit_fence = -1;
2862 			}
2863 		}
2864 	}
2865 
2866 	for (i = 0; i < NUM_ENGINES; i++) {
2867 		if (!wrk->nrequest[i])
2868 			continue;
2869 
2870 		w = igt_list_last_entry(&wrk->requests[i], w, rq_link);
2871 		gem_sync(fd, w->obj[0].handle);
2872 	}
2873 
2874 	clock_gettime(CLOCK_MONOTONIC, &t_end);
2875 
2876 	if (wrk->print_stats) {
2877 		double t = elapsed(&t_start, &t_end);
2878 
2879 		printf("%c%u: %.3fs elapsed (%d cycles, %.3f workloads/s).",
2880 		       wrk->background ? ' ' : '*', wrk->id,
2881 		       t, count, count / t);
2882 		if (wrk->balancer)
2883 			printf(" %lu (%lu + %lu) total VCS batches.",
2884 			       wrk->nr_bb[VCS], wrk->nr_bb[VCS1], wrk->nr_bb[VCS2]);
2885 		if (wrk->balancer && wrk->balancer->get_qd)
2886 			printf(" Average queue depths %.3f, %.3f.",
2887 			       (double)wrk->qd_sum[VCS1] / wrk->nr_bb[VCS],
2888 			       (double)wrk->qd_sum[VCS2] / wrk->nr_bb[VCS]);
2889 		putchar('\n');
2890 	}
2891 
2892 	return NULL;
2893 }
2894 
2895 static void fini_workload(struct workload *wrk)
2896 {
2897 	free(wrk->steps);
2898 	free(wrk);
2899 }
2900 
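/*
 * Find the batch size (in dwords) which keeps the GPU busy for roughly
 * nop_calibration_us: time a burst of executions of a nop-filled batch,
 * rescale the size by the measured rate and iterate until successive sizes
 * agree within the given tolerance (and at least ~5 seconds have elapsed).
 */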
2901 static unsigned long calibrate_nop(unsigned int tolerance_pct)
2902 {
2903 	const uint32_t bbe = 0xa << 23;
2904 	unsigned int loops = 17;
2905 	unsigned int usecs = nop_calibration_us;
2906 	struct drm_i915_gem_exec_object2 obj = {};
2907 	struct drm_i915_gem_execbuffer2 eb =
2908 		{ .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
2909 	long size, last_size;
2910 	struct timespec t_0, t_end;
2911 
2912 	clock_gettime(CLOCK_MONOTONIC, &t_0);
2913 
2914 	size = 256 * 1024;
2915 	do {
2916 		struct timespec t_start;
2917 
2918 		obj.handle = gem_create(fd, size);
2919 		gem_write(fd, obj.handle, size - sizeof(bbe), &bbe,
2920 			  sizeof(bbe));
2921 		gem_execbuf(fd, &eb);
2922 		gem_sync(fd, obj.handle);
2923 
2924 		clock_gettime(CLOCK_MONOTONIC, &t_start);
2925 		for (int loop = 0; loop < loops; loop++)
2926 			gem_execbuf(fd, &eb);
2927 		gem_sync(fd, obj.handle);
2928 		clock_gettime(CLOCK_MONOTONIC, &t_end);
2929 
2930 		gem_close(fd, obj.handle);
2931 
2932 		last_size = size;
2933 		size = loops * size / elapsed(&t_start, &t_end) / 1e6 * usecs;
2934 		size = ALIGN(size, sizeof(uint32_t));
2935 	} while (elapsed(&t_0, &t_end) < 5 ||
2936 		 abs(size - last_size) > (size * tolerance_pct / 100));
2937 
2938 	return size / sizeof(uint32_t);
2939 }
2940 
2941 static void print_help(void)
2942 {
2943 	unsigned int i;
2944 
2945 	puts(
2946 "Usage: gem_wsim [OPTIONS]\n"
2947 "\n"
2948 "Runs a simulated workload on the GPU.\n"
2949 "When run without arguments it performs a GPU calibration, the result of which\n"
2950 "needs to be provided when running the simulation in subsequent invocations.\n"
2951 "\n"
2952 "Options:\n"
2953 "  -h              This text.\n"
2954 "  -q              Be quiet - do not output anything to stdout.\n"
2955 "  -n <n>          Nop calibration value.\n"
2956 "  -t <n>          Nop calibration tolerance percentage.\n"
2957 "                  Use when there is a difficulty obtaining calibration with the\n"
2958 "                  default settings.\n"
2959 "  -I <n>          Initial randomness seed.\n"
2960 "  -p <n>          Context priority to use for the following workload on the\n"
2961 "                  command line.\n"
2962 "  -w <desc|path>  Filename or a workload descriptor.\n"
2963 "                  Can be given multiple times.\n"
2964 "  -W <desc|path>  Filename or a master workload descriptor.\n"
2965 "                  Only one master workload can be optionally specified in which\n"
2966 "                  case all other workloads become background ones and run as\n"
2967 "                  long as the master.\n"
2968 "  -a <desc|path>  Append a workload to all other workloads.\n"
2969 "  -r <n>          How many times to emit the workload.\n"
2970 "  -c <n>          Fork N clients emitting the workload simultaneously.\n"
2971 "  -x              Swap VCS1 and VCS2 engines in every other client.\n"
2972 "  -b <n>          Load balancing to use.\n"
2973 "                  Available load balancers are:"
2974 	);
2975 
2976 	for (i = 0; i < ARRAY_SIZE(all_balancers); i++) {
2977 		igt_assert(all_balancers[i].desc);
2978 		printf(
2979 "                     %s (%u): %s\n",
2980 		       all_balancers[i].name, all_balancers[i].id,
2981 		       all_balancers[i].desc);
2982 	}
2983 	puts(
2984 "                  Balancers can be specified either as names or as their id\n"
2985 "                  number as listed above.\n"
2986 "  -2              Remap VCS2 to BCS.\n"
2987 "  -R              Round-robin initial VCS assignment per client.\n"
2988 "  -H              Send heartbeat on synchronisation points with seqno based\n"
2989 "                  balancers. Gives better engine busyness view in some cases.\n"
2990 "  -s              Turn on small SSEU config for the next workload on the\n"
2991 "                  command line. Subsequent -s switches it off.\n"
2992 "  -S              Synchronize the sequence of random batch durations between\n"
2993 "                  clients.\n"
2994 "  -G              Global load balancing - a single load balancer will be shared\n"
2995 "                  between all clients and there will be a single seqno domain.\n"
2996 "  -d              Sync between data dependencies in userspace."
2997 	);
2998 }
2999 
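/*
 * A -w/-W/-a argument can be either an inline descriptor or a file name.
 * If it names a regular file, read it and turn newlines into commas so the
 * same parser handles both forms; otherwise return the string unchanged.
 */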
3000 static char *load_workload_descriptor(char *filename)
3001 {
3002 	struct stat sbuf;
3003 	char *buf;
3004 	int infd, ret, i;
3005 	ssize_t len;
3006 
3007 	ret = stat(filename, &sbuf);
3008 	if (ret || !S_ISREG(sbuf.st_mode))
3009 		return filename;
3010 
3011 	igt_assert(sbuf.st_size < 1024 * 1024); /* Just so. */
3012 	buf = malloc(sbuf.st_size);
3013 	igt_assert(buf);
3014 
3015 	infd = open(filename, O_RDONLY);
3016 	igt_assert(infd >= 0);
3017 	len = read(infd, buf, sbuf.st_size);
3018 	igt_assert(len == sbuf.st_size);
3019 	close(infd);
3020 
3021 	for (i = 0; i < len; i++) {
3022 		if (buf[i] == '\n')
3023 			buf[i] = ',';
3024 	}
3025 
3026 	len--;
3027 	while (buf[len] == ',')
3028 		buf[len--] = 0;
3029 
3030 	return buf;
3031 }
3032 
3033 static struct w_arg *
3034 add_workload_arg(struct w_arg *w_args, unsigned int nr_args, char *w_arg,
3035 		 int prio, bool sseu)
3036 {
3037 	w_args = realloc(w_args, sizeof(*w_args) * nr_args);
3038 	igt_assert(w_args);
3039 	w_args[nr_args - 1] = (struct w_arg) { w_arg, NULL, prio, sseu };
3040 
3041 	return w_args;
3042 }
3043 
3044 static int find_balancer_by_name(char *name)
3045 {
3046 	unsigned int i;
3047 
3048 	for (i = 0; i < ARRAY_SIZE(all_balancers); i++) {
3049 		if (!strcasecmp(name, all_balancers[i].name))
3050 			return all_balancers[i].id;
3051 	}
3052 
3053 	return -1;
3054 }
3055 
3056 static const struct workload_balancer *find_balancer_by_id(unsigned int id)
3057 {
3058 	unsigned int i;
3059 
3060 	for (i = 0; i < ARRAY_SIZE(all_balancers); i++) {
3061 		if (id == all_balancers[i].id)
3062 			return &all_balancers[i];
3063 	}
3064 
3065 	return NULL;
3066 }
3067 
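/*
 * Relate RCS_TIMESTAMP ticks to wall-clock time. Only reported when verbose;
 * the measurement subtracts an estimate of the register-read and
 * clock_gettime() overhead from a ~100us sampling window.
 */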
3068 static void init_clocks(void)
3069 {
3070 	struct timespec t_start, t_end;
3071 	uint32_t rcs_start, rcs_end;
3072 	double overhead, t;
3073 
3074 	intel_register_access_init(intel_get_pci_device(), false, fd);
3075 
3076 	if (verbose <= 1)
3077 		return;
3078 
3079 	clock_gettime(CLOCK_MONOTONIC, &t_start);
3080 	for (int i = 0; i < 100; i++)
3081 		rcs_start = *REG(RCS_TIMESTAMP);
3082 	clock_gettime(CLOCK_MONOTONIC, &t_end);
3083 	overhead = 2 * elapsed(&t_start, &t_end) / 100;
3084 
3085 	clock_gettime(CLOCK_MONOTONIC, &t_start);
3086 	for (int i = 0; i < 100; i++)
3087 		clock_gettime(CLOCK_MONOTONIC, &t_end);
3088 	clock_gettime(CLOCK_MONOTONIC, &t_end);
3089 	overhead += elapsed(&t_start, &t_end) / 100;
3090 
3091 	clock_gettime(CLOCK_MONOTONIC, &t_start);
3092 	rcs_start = *REG(RCS_TIMESTAMP);
3093 	usleep(100);
3094 	rcs_end = *REG(RCS_TIMESTAMP);
3095 	clock_gettime(CLOCK_MONOTONIC, &t_end);
3096 
3097 	t = elapsed(&t_start, &t_end) - overhead;
3098 	printf("%d cycles in %.1fus, i.e. 1024 cycles take %.1fus\n",
3099 	       rcs_end - rcs_start, 1e6*t, 1024e6 * t / (rcs_end - rcs_start));
3100 }
3101 
3102 int main(int argc, char **argv)
3103 {
3104 	unsigned int repeat = 1;
3105 	unsigned int clients = 1;
3106 	unsigned int flags = 0;
3107 	struct timespec t_start, t_end;
3108 	struct workload **w, **wrk = NULL;
3109 	struct workload *app_w = NULL;
3110 	unsigned int nr_w_args = 0;
3111 	int master_workload = -1;
3112 	char *append_workload_arg = NULL;
3113 	struct w_arg *w_args = NULL;
3114 	unsigned int tolerance_pct = 1;
3115 	const struct workload_balancer *balancer = NULL;
3116 	char *endptr = NULL;
3117 	int prio = 0;
3118 	double t;
3119 	int i, c;
3120 
3121 	/*
3122 	 * Open the device via the low-level API so we can do the GPU quiesce
3123 	 * manually as close as possible in time to the start of the workload.
3124 	 * This minimizes the gap in engine utilization tracking when observed
3125 	 * via external tools like trace.pl.
3126 	 */
3127 	fd = __drm_open_driver(DRIVER_INTEL);
3128 	igt_require(fd);
3129 
3130 	init_clocks();
3131 
3132 	master_prng = time(NULL);
3133 
3134 	while ((c = getopt(argc, argv,
3135 			   "hqv2RsSHxGdc:n:r:w:W:a:t:b:p:I:")) != -1) {
3136 		switch (c) {
3137 		case 'W':
3138 			if (master_workload >= 0) {
3139 				wsim_err("Only one master workload can be given!\n");
3140 				return 1;
3141 			}
3142 			master_workload = nr_w_args;
3143 			/* Fall through */
3144 		case 'w':
3145 			w_args = add_workload_arg(w_args, ++nr_w_args, optarg,
3146 						  prio, flags & SSEU);
3147 			break;
3148 		case 'p':
3149 			prio = atoi(optarg);
3150 			break;
3151 		case 'a':
3152 			if (append_workload_arg) {
3153 				wsim_err("Only one append workload can be given!\n");
3154 				return 1;
3155 			}
3156 			append_workload_arg = optarg;
3157 			break;
3158 		case 'c':
3159 			clients = strtol(optarg, NULL, 0);
3160 			break;
3161 		case 't':
3162 			tolerance_pct = strtol(optarg, NULL, 0);
3163 			break;
3164 		case 'n':
3165 			nop_calibration = strtol(optarg, NULL, 0);
3166 			break;
3167 		case 'r':
3168 			repeat = strtol(optarg, NULL, 0);
3169 			break;
3170 		case 'q':
3171 			verbose = 0;
3172 			break;
3173 		case 'v':
3174 			verbose++;
3175 			break;
3176 		case 'x':
3177 			flags |= SWAPVCS;
3178 			break;
3179 		case '2':
3180 			flags |= VCS2REMAP;
3181 			break;
3182 		case 'R':
3183 			flags |= INITVCSRR;
3184 			break;
3185 		case 'S':
3186 			flags |= SYNCEDCLIENTS;
3187 			break;
3188 		case 's':
3189 			flags ^= SSEU;
3190 			break;
3191 		case 'H':
3192 			flags |= HEARTBEAT;
3193 			break;
3194 		case 'G':
3195 			flags |= GLOBAL_BALANCE;
3196 			break;
3197 		case 'd':
3198 			flags |= DEPSYNC;
3199 			break;
3200 		case 'b':
3201 			i = find_balancer_by_name(optarg);
3202 			if (i < 0) {
3203 				i = strtol(optarg, &endptr, 0);
3204 				if (endptr && *endptr)
3205 					i = -1;
3206 			}
3207 
3208 			if (i >= 0) {
3209 				balancer = find_balancer_by_id(i);
3210 				if (balancer) {
3211 					igt_assert(intel_gen(intel_get_drm_devid(fd)) >= balancer->min_gen);
3212 					flags |= BALANCE | balancer->flags;
3213 				}
3214 			}
3215 
3216 			if (!balancer) {
3217 				wsim_err("Unknown balancing mode '%s'!\n",
3218 					 optarg);
3219 				return 1;
3220 			}
3221 			break;
3222 		case 'I':
3223 			master_prng = strtol(optarg, NULL, 0);
3224 			break;
3225 		case 'h':
3226 			print_help();
3227 			return 0;
3228 		default:
3229 			return 1;
3230 		}
3231 	}
3232 
3233 	if ((flags & HEARTBEAT) && !(flags & SEQNO)) {
3234 		wsim_err("Heartbeat needs a seqno based balancer!\n");
3235 		return 1;
3236 	}
3237 
3238 	if ((flags & VCS2REMAP) && (flags & I915)) {
3239 		wsim_err("VCS remapping not supported with i915 balancing!\n");
3240 		return 1;
3241 	}
3242 
3243 	if (!nop_calibration) {
3244 		if (verbose > 1)
3245 			printf("Calibrating nop delay with %u%% tolerance...\n",
3246 				tolerance_pct);
3247 		nop_calibration = calibrate_nop(tolerance_pct);
3248 		if (verbose)
3249 			printf("Nop calibration for %uus delay is %lu.\n",
3250 			       nop_calibration_us, nop_calibration);
3251 
3252 		return 0;
3253 	}
3254 
3255 	if (!nr_w_args) {
3256 		wsim_err("No workload descriptor(s)!\n");
3257 		return 1;
3258 	}
3259 
3260 	if (nr_w_args > 1 && clients > 1) {
3261 		wsim_err("Cloned clients cannot be combined with multiple workloads!\n");
3262 		return 1;
3263 	}
3264 
3265 	if ((flags & GLOBAL_BALANCE) && !balancer) {
3266 		wsim_err("Balancer not specified in global balancing mode!\n");
3267 		return 1;
3268 	}
3269 
3270 	if (append_workload_arg) {
3271 		append_workload_arg = load_workload_descriptor(append_workload_arg);
3272 		if (!append_workload_arg) {
3273 			wsim_err("Failed to load append workload descriptor!\n");
3274 			return 1;
3275 		}
3276 	}
3277 
3278 	if (append_workload_arg) {
3279 		struct w_arg arg = { NULL, append_workload_arg, 0 };
3280 		app_w = parse_workload(&arg, flags, NULL);
3281 		if (!app_w) {
3282 			wsim_err("Failed to parse append workload!\n");
3283 			return 1;
3284 		}
3285 	}
3286 
3287 	wrk = calloc(nr_w_args, sizeof(*wrk));
3288 	igt_assert(wrk);
3289 
3290 	for (i = 0; i < nr_w_args; i++) {
3291 		w_args[i].desc = load_workload_descriptor(w_args[i].filename);
3292 
3293 		if (!w_args[i].desc) {
3294 			wsim_err("Failed to load workload descriptor %u!\n", i);
3295 			return 1;
3296 		}
3297 
3298 		wrk[i] = parse_workload(&w_args[i], flags, app_w);
3299 		if (!wrk[i]) {
3300 			wsim_err("Failed to parse workload %u!\n", i);
3301 			return 1;
3302 		}
3303 	}
3304 
3305 	if (nr_w_args > 1)
3306 		clients = nr_w_args;
3307 
3308 	if (verbose > 1) {
3309 		printf("Random seed is %u.\n", master_prng);
3310 		printf("Using %lu nop calibration for %uus delay.\n",
3311 		       nop_calibration, nop_calibration_us);
3312 		printf("%u client%s.\n", clients, clients > 1 ? "s" : "");
3313 		if (flags & SWAPVCS)
3314 			printf("Swapping VCS rings between clients.\n");
3315 		if (flags & GLOBAL_BALANCE) {
3316 			if (flags & I915) {
3317 				printf("Ignoring global balancing with i915!\n");
3318 				flags &= ~GLOBAL_BALANCE;
3319 			} else {
3320 				printf("Using %s balancer in global mode.\n",
3321 				       balancer->name);
3322 			}
3323 		} else if (balancer) {
3324 			printf("Using %s balancer.\n", balancer->name);
3325 		}
3326 	}
3327 
3328 	srand(master_prng);
3329 	master_prng = rand();
3330 
3331 	if (master_workload >= 0 && clients == 1)
3332 		master_workload = -1;
3333 
3334 	w = calloc(clients, sizeof(struct workload *));
3335 	igt_assert(w);
3336 
3337 	for (i = 0; i < clients; i++) {
3338 		unsigned int flags_ = flags;
3339 
3340 		w[i] = clone_workload(wrk[nr_w_args > 1 ? i : 0]);
3341 
3342 		if (flags & SWAPVCS && i & 1)
3343 			flags_ &= ~SWAPVCS;
3344 
3345 		if ((flags & GLOBAL_BALANCE) && !(flags & I915)) {
3346 			w[i]->balancer = &global_balancer;
3347 			w[i]->global_wrk = w[0];
3348 			w[i]->global_balancer = balancer;
3349 		} else {
3350 			w[i]->balancer = balancer;
3351 		}
3352 
3353 		w[i]->flags = flags;
3354 		w[i]->repeat = repeat;
3355 		w[i]->background = master_workload >= 0 && i != master_workload;
3356 		w[i]->print_stats = verbose > 1 ||
3357 				    (verbose > 0 && master_workload == i);
3358 
3359 		if (prepare_workload(i, w[i], flags_)) {
3360 			wsim_err("Failed to prepare workload %u!\n", i);
3361 			return 1;
3362 		}
3363 
3364 
3365 		if (balancer && balancer->init) {
3366 			int ret = balancer->init(balancer, w[i]);
3367 			if (ret) {
3368 				wsim_err("Failed to initialize balancing! (%u=%d)\n",
3369 					 i, ret);
3370 				return 1;
3371 			}
3372 		}
3373 	}
3374 
3375 	gem_quiescent_gpu(fd);
3376 
3377 	clock_gettime(CLOCK_MONOTONIC, &t_start);
3378 
3379 	for (i = 0; i < clients; i++) {
3380 		int ret;
3381 
3382 		ret = pthread_create(&w[i]->thread, NULL, run_workload, w[i]);
3383 		igt_assert_eq(ret, 0);
3384 	}
3385 
3386 	if (master_workload >= 0) {
3387 		int ret = pthread_join(w[master_workload]->thread, NULL);
3388 
3389 		igt_assert(ret == 0);
3390 
3391 		for (i = 0; i < clients; i++)
3392 			w[i]->run = false;
3393 	}
3394 
3395 	for (i = 0; i < clients; i++) {
3396 		if (master_workload != i) {
3397 			int ret = pthread_join(w[i]->thread, NULL);
3398 			igt_assert(ret == 0);
3399 		}
3400 	}
3401 
3402 	clock_gettime(CLOCK_MONOTONIC, &t_end);
3403 
3404 	t = elapsed(&t_start, &t_end);
3405 	if (verbose)
3406 		printf("%.3fs elapsed (%.3f workloads/s)\n",
3407 		       t, clients * repeat / t);
3408 
3409 	for (i = 0; i < clients; i++)
3410 		fini_workload(w[i]);
3411 	free(w);
3412 	for (i = 0; i < nr_w_args; i++)
3413 		fini_workload(wrk[i]);
3414 	free(w_args);
3415 
3416 	return 0;
3417 }
3418