/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "gpu_cmds.h"

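/*
 * Helpers for building media/GPGPU pipelines in a libdrm batchbuffer: the
 * fill_* routines write indirect state (CURBE data, surface states, binding
 * tables, interface descriptors) into the batch and return its batch offset,
 * while the emit_* routines write the commands that consume that state.
 */
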
void
gen7_render_flush(struct intel_batchbuffer *batch, uint32_t batch_end)
{
	int ret;

	ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
	if (ret == 0)
		ret = drm_intel_bo_mrb_exec(batch->bo, batch_end,
					    NULL, 0, 0, 0);
	igt_assert(ret == 0);
}

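/*
 * Like gen7_render_flush(), but submits the batch against batch->ctx via
 * drm_intel_gem_bo_context_exec() instead of the default context.
 */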
void
gen7_render_context_flush(struct intel_batchbuffer *batch, uint32_t batch_end)
{
	int ret;

	ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
	if (ret == 0)
		ret = drm_intel_gem_bo_context_exec(batch->bo, batch->ctx,
						    batch_end, 0);
	igt_assert(ret == 0);
}

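/*
 * Reserve an 8-dword CURBE slot in the batch, store the fill color in its
 * first byte and return the slot's offset within the batch.
 */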
uint32_t
gen7_fill_curbe_buffer_data(struct intel_batchbuffer *batch,
			    uint8_t color)
{
	uint8_t *curbe_buffer;
	uint32_t offset;

	curbe_buffer = intel_batchbuffer_subdata_alloc(batch,
						       sizeof(uint32_t) * 8,
						       64);
	offset = intel_batchbuffer_subdata_offset(batch, curbe_buffer);
	*curbe_buffer = color;

	return offset;
}

uint32_t
gen11_fill_curbe_buffer_data(struct intel_batchbuffer *batch)
{
	uint32_t *curbe_buffer;
	uint32_t offset;

	curbe_buffer = intel_batchbuffer_subdata_alloc(batch,
						       sizeof(uint32_t) * 8,
						       64);
	offset = intel_batchbuffer_subdata_offset(batch, curbe_buffer);
	*curbe_buffer++ = 0;
	*curbe_buffer   = 1;

	return offset;
}

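/*
 * Write a 2D SURFACE_STATE for the given buffer into the batch, emitting a
 * relocation for its base address, and return the state's batch offset.
 * Destination surfaces are relocated in the render domain, source surfaces
 * in the sampler domain.
 */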
uint32_t
gen7_fill_surface_state(struct intel_batchbuffer *batch,
			const struct igt_buf *buf,
			uint32_t format,
			int is_dst)
{
	struct gen7_surface_state *ss;
	uint32_t write_domain, read_domain, offset;
	int ret;

	if (is_dst) {
		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
	} else {
		write_domain = 0;
		read_domain = I915_GEM_DOMAIN_SAMPLER;
	}

	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 64);
	offset = intel_batchbuffer_subdata_offset(batch, ss);

	ss->ss0.surface_type = SURFACE_2D;
	ss->ss0.surface_format = format;
	ss->ss0.render_cache_read_write = 1;

	if (buf->tiling == I915_TILING_X)
		ss->ss0.tiled_mode = 2;
	else if (buf->tiling == I915_TILING_Y)
		ss->ss0.tiled_mode = 3;

	ss->ss1.base_addr = buf->bo->offset;
	ret = drm_intel_bo_emit_reloc(batch->bo,
				intel_batchbuffer_subdata_offset(batch, ss) + 4,
				buf->bo, 0,
				read_domain, write_domain);
	igt_assert(ret == 0);

	ss->ss2.height = igt_buf_height(buf) - 1;
	ss->ss2.width  = igt_buf_width(buf) - 1;

	ss->ss3.pitch  = buf->stride - 1;

	ss->ss7.shader_chanel_select_r = 4;
	ss->ss7.shader_chanel_select_g = 5;
	ss->ss7.shader_chanel_select_b = 6;
	ss->ss7.shader_chanel_select_a = 7;

	return offset;
}

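/*
 * Write a binding table whose single entry points at an R8_UNORM surface
 * state for the destination buffer, using the gen7 or gen8+ surface state
 * layout depending on the device id.
 */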
uint32_t
gen7_fill_binding_table(struct intel_batchbuffer *batch,
			const struct igt_buf *dst)
{
	uint32_t *binding_table, offset;

	binding_table = intel_batchbuffer_subdata_alloc(batch, 32, 64);
	offset = intel_batchbuffer_subdata_offset(batch, binding_table);
	if (IS_GEN7(batch->devid))
		binding_table[0] = gen7_fill_surface_state(batch, dst,
						SURFACEFORMAT_R8_UNORM, 1);
	else
		binding_table[0] = gen8_fill_surface_state(batch, dst,
						SURFACEFORMAT_R8_UNORM, 1);

	return offset;
}

uint32_t
gen11_fill_binding_table(struct intel_batchbuffer *batch,
			 const struct igt_buf *src, const struct igt_buf *dst)
{
	uint32_t *binding_table, offset;

	binding_table = intel_batchbuffer_subdata_alloc(batch, 64, 64);
	offset = intel_batchbuffer_subdata_offset(batch, binding_table);
	binding_table[0] = gen11_fill_surface_state(batch, src,
						    SURFACE_1D,
						    SURFACEFORMAT_R32G32B32A32_FLOAT,
						    0, 0, 0);
	binding_table[1] = gen11_fill_surface_state(batch, dst,
						    SURFACE_BUFFER,
						    SURFACEFORMAT_RAW,
						    1, 1, 1);

	return offset;
}

uint32_t
gen7_fill_kernel(struct intel_batchbuffer *batch,
		const uint32_t kernel[][4],
		size_t size)
{
	uint32_t offset;

	offset = intel_batchbuffer_copy_data(batch, kernel, size, 64);

	return offset;
}

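/*
 * Build an INTERFACE_DESCRIPTOR_DATA entry pointing at the kernel and a
 * binding table for the destination buffer, reading a single GRF of CURBE
 * data, and return its offset in the batch.
 */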
uint32_t
gen7_fill_interface_descriptor(struct intel_batchbuffer *batch,
			       const struct igt_buf *dst,
			       const uint32_t kernel[][4],
			       size_t size)
{
	struct gen7_interface_descriptor_data *idd;
	uint32_t offset;
	uint32_t binding_table_offset, kernel_offset;

	binding_table_offset = gen7_fill_binding_table(batch, dst);
	kernel_offset = gen7_fill_kernel(batch, kernel, size);

	idd = intel_batchbuffer_subdata_alloc(batch, sizeof(*idd), 64);
	offset = intel_batchbuffer_subdata_offset(batch, idd);

	idd->desc0.kernel_start_pointer = (kernel_offset >> 6);

	idd->desc1.single_program_flow = 1;
	idd->desc1.floating_point_mode = GEN7_FLOATING_POINT_IEEE_754;

	idd->desc2.sampler_count = 0;      /* 0 samplers used */
	idd->desc2.sampler_state_pointer = 0;

	idd->desc3.binding_table_entry_count = 0;
	idd->desc3.binding_table_pointer = (binding_table_offset >> 5);

	idd->desc4.constant_urb_entry_read_offset = 0;
	idd->desc4.constant_urb_entry_read_length = 1; /* grf 1 */

	return offset;
}

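/*
 * Point the surface, dynamic and instruction state base addresses at the
 * start of the batch bo, so the offsets returned by the fill_* helpers above
 * are valid relative to those bases.
 */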
void
gen7_emit_state_base_address(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN7_STATE_BASE_ADDRESS | (10 - 2));

	/* general */
	OUT_BATCH(0);

	/* surface */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
		  BASE_ADDRESS_MODIFY);

	/* dynamic */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
		  BASE_ADDRESS_MODIFY);

	/* indirect */
	OUT_BATCH(0);

	/* instruction */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
		  BASE_ADDRESS_MODIFY);

	/* general/dynamic/indirect/instruction access upper bounds */
	OUT_BATCH(0);
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
}

void
gen7_emit_vfe_state(struct intel_batchbuffer *batch, uint32_t threads,
		    uint32_t urb_entries, uint32_t urb_size,
		    uint32_t curbe_size, uint32_t mode)
{
	OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));

	/* scratch buffer */
	OUT_BATCH(0);

	/* number of threads & urb entries */
	OUT_BATCH(threads << 16 |
		urb_entries << 8 |
		mode << 2); /* GPGPU vs media mode */

	OUT_BATCH(0);

	/* urb entry size & curbe size */
	OUT_BATCH(urb_size << 16 |	/* in 256-bit units */
		  curbe_size);		/* in 256-bit units */

	/* scoreboard */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}

void
gen7_emit_curbe_load(struct intel_batchbuffer *batch, uint32_t curbe_buffer)
{
	OUT_BATCH(GEN7_MEDIA_CURBE_LOAD | (4 - 2));
	OUT_BATCH(0);
	/* curbe total data length */
	OUT_BATCH(64);
	/* curbe data start address, relative to the dynamic state base address */
	OUT_BATCH(curbe_buffer);
}

void
gen7_emit_interface_descriptor_load(struct intel_batchbuffer *batch,
				    uint32_t interface_descriptor)
{
	OUT_BATCH(GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
	OUT_BATCH(0);
	/* interface descriptor data length */
	if (IS_GEN7(batch->devid))
		OUT_BATCH(sizeof(struct gen7_interface_descriptor_data));
	else
		OUT_BATCH(sizeof(struct gen8_interface_descriptor_data));
	/* interface descriptor address, relative to the dynamic state base
	 * address
	 */
	OUT_BATCH(interface_descriptor);
}

void
gen7_emit_media_objects(struct intel_batchbuffer *batch,
			unsigned int x, unsigned int y,
			unsigned int width, unsigned int height)
{
	int i, j;

	for (i = 0; i < width / 16; i++) {
		for (j = 0; j < height / 16; j++) {
			gen_emit_media_object(batch, x + i * 16, y + j * 16);
		}
	}
}

void
gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
		     unsigned int x, unsigned int y,
		     unsigned int width, unsigned int height)
{
	uint32_t x_dim, y_dim, tmp, right_mask;

	/*
	 * Simply do SIMD16 based dispatch, so every thread uses
	 * SIMD16 channels.
	 *
	 * Define our own thread group size, e.g. 16x1 for every group, so
	 * each group runs one thread in SIMD16 dispatch and the thread
	 * width/height/depth are all 1.
	 *
	 * Then thread group X = width / 16 (rounded up)
	 * thread group Y = height;
	 */
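	/*
	 * Worked example (illustrative): for width = 72, height = 64 this
	 * gives x_dim = (72 + 15) / 16 = 5 and y_dim = 64; the right-most
	 * column only has 72 % 16 = 8 valid channels, so right_mask =
	 * (1 << 8) - 1 = 0xff, while a 16-aligned width uses the full
	 * 0xffff mask.
	 */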
	x_dim = (width + 15) / 16;
	y_dim = height;

	tmp = width & 15;
	if (tmp == 0)
		right_mask = (1 << 16) - 1;
	else
		right_mask = (1 << tmp) - 1;

	OUT_BATCH(GEN7_GPGPU_WALKER | 9);

	/* interface descriptor offset */
	OUT_BATCH(0);

	/* SIMD size, thread w/h/d */
	OUT_BATCH(1 << 30 | /* SIMD16 */
		  0 << 16 | /* depth:1 */
		  0 << 8 | /* height:1 */
		  0); /* width:1 */

	/* thread group X */
	OUT_BATCH(0);
	OUT_BATCH(x_dim);

	/* thread group Y */
	OUT_BATCH(0);
	OUT_BATCH(y_dim);

	/* thread group Z */
	OUT_BATCH(0);
	OUT_BATCH(1);

	/* right mask */
	OUT_BATCH(right_mask);

	/* bottom mask, height 1, always 0xffffffff */
	OUT_BATCH(0xffffffff);
}

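/*
 * Store the iteration count consumed by the spin kernel as CURBE data and
 * return its offset in the batch.
 */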
uint32_t
gen8_spin_curbe_buffer_data(struct intel_batchbuffer *batch,
			    uint32_t iters)
{
	uint32_t *curbe_buffer;
	uint32_t offset;

	curbe_buffer = intel_batchbuffer_subdata_alloc(batch, 64, 64);
	offset = intel_batchbuffer_subdata_offset(batch, curbe_buffer);
	*curbe_buffer = iters;

	return offset;
}

uint32_t
gen8_fill_surface_state(struct intel_batchbuffer *batch,
			const struct igt_buf *buf,
			uint32_t format,
			int is_dst)
{
	struct gen8_surface_state *ss;
	uint32_t write_domain, read_domain, offset;
	int ret;

	if (is_dst) {
		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
	} else {
		write_domain = 0;
		read_domain = I915_GEM_DOMAIN_SAMPLER;
	}

	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 64);
	offset = intel_batchbuffer_subdata_offset(batch, ss);

	ss->ss0.surface_type = SURFACE_2D;
	ss->ss0.surface_format = format;
	ss->ss0.render_cache_read_write = 1;
	ss->ss0.vertical_alignment = 1; /* align 4 */
	ss->ss0.horizontal_alignment = 1; /* align 4 */

	if (buf->tiling == I915_TILING_X)
		ss->ss0.tiled_mode = 2;
	else if (buf->tiling == I915_TILING_Y)
		ss->ss0.tiled_mode = 3;

	ss->ss8.base_addr = buf->bo->offset;

	ret = drm_intel_bo_emit_reloc(batch->bo,
				intel_batchbuffer_subdata_offset(batch, ss) + 8 * 4,
				buf->bo, 0, read_domain, write_domain);
	igt_assert(ret == 0);

	ss->ss2.height = igt_buf_height(buf) - 1;
	ss->ss2.width  = igt_buf_width(buf) - 1;
	ss->ss3.pitch  = buf->stride - 1;

	ss->ss7.shader_chanel_select_r = 4;
	ss->ss7.shader_chanel_select_g = 5;
	ss->ss7.shader_chanel_select_b = 6;
	ss->ss7.shader_chanel_select_a = 7;

	return offset;
}

uint32_t
gen11_fill_surface_state(struct intel_batchbuffer *batch,
			const struct igt_buf *buf,
			uint32_t surface_type,
			uint32_t format,
			uint32_t vertical_alignment,
			uint32_t horizontal_alignment,
			int is_dst)
{
	struct gen8_surface_state *ss;
	uint32_t write_domain, read_domain, offset;
	int ret;

	if (is_dst) {
		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
	} else {
		write_domain = 0;
		read_domain = I915_GEM_DOMAIN_SAMPLER;
	}

	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 64);
	offset = intel_batchbuffer_subdata_offset(batch, ss);

	ss->ss0.surface_type = surface_type;
	ss->ss0.surface_format = format;
	ss->ss0.render_cache_read_write = 1;
	ss->ss0.vertical_alignment = vertical_alignment; /* align 4 */
	ss->ss0.horizontal_alignment = horizontal_alignment; /* align 4 */

	if (buf->tiling == I915_TILING_X)
		ss->ss0.tiled_mode = 2;
	else if (buf->tiling == I915_TILING_Y)
		ss->ss0.tiled_mode = 3;
	else
		ss->ss0.tiled_mode = 0;

	ss->ss8.base_addr = buf->bo->offset;

	ret = drm_intel_bo_emit_reloc(batch->bo,
				intel_batchbuffer_subdata_offset(batch, ss) + 8 * 4,
				buf->bo, 0, read_domain, write_domain);
	igt_assert(ret == 0);

	if (is_dst) {
		ss->ss1.memory_object_control = 2;
		ss->ss2.height = 1;
		ss->ss2.width  = 95;
		ss->ss3.pitch  = 0;
		ss->ss7.shader_chanel_select_r = 4;
		ss->ss7.shader_chanel_select_g = 5;
		ss->ss7.shader_chanel_select_b = 6;
		ss->ss7.shader_chanel_select_a = 7;
	} else {
		ss->ss1.qpitch = 4040;
		ss->ss1.base_mip_level = 31;
		ss->ss2.height = 9216;
		ss->ss2.width  = 1019;
		ss->ss3.pitch  = 64;
		ss->ss5.mip_count = 2;
	}

	return offset;
}

uint32_t
gen8_fill_interface_descriptor(struct intel_batchbuffer *batch,
			       const struct igt_buf *dst,
			       const uint32_t kernel[][4],
			       size_t size)
{
	struct gen8_interface_descriptor_data *idd;
	uint32_t offset;
	uint32_t binding_table_offset, kernel_offset;

	binding_table_offset = gen7_fill_binding_table(batch, dst);
	kernel_offset = gen7_fill_kernel(batch, kernel, size);

	idd = intel_batchbuffer_subdata_alloc(batch, sizeof(*idd), 64);
	offset = intel_batchbuffer_subdata_offset(batch, idd);

	idd->desc0.kernel_start_pointer = (kernel_offset >> 6);

	idd->desc2.single_program_flow = 1;
	idd->desc2.floating_point_mode = GEN8_FLOATING_POINT_IEEE_754;

	idd->desc3.sampler_count = 0;      /* 0 samplers used */
	idd->desc3.sampler_state_pointer = 0;

	idd->desc4.binding_table_entry_count = 0;
	idd->desc4.binding_table_pointer = (binding_table_offset >> 5);

	idd->desc5.constant_urb_entry_read_offset = 0;
	idd->desc5.constant_urb_entry_read_length = 1; /* grf 1 */

	idd->desc6.num_threads_in_tg = 1;

	return offset;
}

uint32_t
gen11_fill_interface_descriptor(struct intel_batchbuffer *batch,
			       const struct igt_buf *src,
			       const struct igt_buf *dst,
			       const uint32_t kernel[][4],
			       size_t size)
{
	struct gen8_interface_descriptor_data *idd;
	uint32_t offset;
	uint32_t binding_table_offset, kernel_offset;

	binding_table_offset = gen11_fill_binding_table(batch, src, dst);
	kernel_offset = gen7_fill_kernel(batch, kernel, size);

	idd = intel_batchbuffer_subdata_alloc(batch, sizeof(*idd), 64);
	offset = intel_batchbuffer_subdata_offset(batch, idd);

	idd->desc0.kernel_start_pointer = (kernel_offset >> 6);

	idd->desc2.single_program_flow = 1;
	idd->desc2.floating_point_mode = GEN8_FLOATING_POINT_IEEE_754;

	idd->desc3.sampler_count = 0;      /* 0 samplers used */
	idd->desc3.sampler_state_pointer = 0;

	idd->desc4.binding_table_entry_count = 0;
	idd->desc4.binding_table_pointer = (binding_table_offset >> 5);

	idd->desc5.constant_urb_entry_read_offset = 0;
	idd->desc5.constant_urb_entry_read_length = 1; /* grf 1 */

	idd->desc6.num_threads_in_tg = 1;

	return offset;
}

void
gen8_emit_state_base_address(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (16 - 2));

	/* general */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);

	/* stateless data port */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);

	/* surface */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY);

	/* dynamic */
	OUT_RELOC(batch->bo,
		  I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION,
		  0, BASE_ADDRESS_MODIFY);

	/* indirect */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* instruction */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
		  BASE_ADDRESS_MODIFY);

	/* general state buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* dynamic state buffer size */
	OUT_BATCH(1 << 12 | 1);
	/* indirect object buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* instruction buffer size; the modify-enable bit must be set here,
	 * otherwise the GPU may hang
	 */
	OUT_BATCH(1 << 12 | 1);
}

void
gen8_emit_media_state_flush(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN8_MEDIA_STATE_FLUSH | (2 - 2));
	OUT_BATCH(0);
}

void
gen8_emit_vfe_state(struct intel_batchbuffer *batch, uint32_t threads,
		    uint32_t urb_entries, uint32_t urb_size,
		    uint32_t curbe_size)
{
	OUT_BATCH(GEN7_MEDIA_VFE_STATE | (9 - 2));

	/* scratch buffer */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* number of threads & urb entries */
	OUT_BATCH(threads << 16 |
		urb_entries << 8);

	OUT_BATCH(0);

	/* urb entry size & curbe size */
	OUT_BATCH(urb_size << 16 |
		curbe_size);

	/* scoreboard */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}

void
gen8_emit_gpgpu_walk(struct intel_batchbuffer *batch,
		     unsigned int x, unsigned int y,
		     unsigned int width, unsigned int height)
{
	uint32_t x_dim, y_dim, tmp, right_mask;

	/*
	 * Simply do SIMD16 based dispatch, so every thread uses
	 * SIMD16 channels.
	 *
	 * Define our own thread group size, e.g. 16x1 for every group, so
	 * each group runs one thread in SIMD16 dispatch and the thread
	 * width/height/depth are all 1.
	 *
	 * Then thread group X = width / 16 (rounded up)
	 * thread group Y = height;
	 */
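	/*
	 * Same dispatch math as gen7_emit_gpgpu_walk(), as an illustrative
	 * example: width = 72 gives x_dim = 5 and right_mask = 0xff, while a
	 * 16-aligned width uses the full 0xffff mask.
	 */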
	x_dim = (width + 15) / 16;
	y_dim = height;

	tmp = width & 15;
	if (tmp == 0)
		right_mask = (1 << 16) - 1;
	else
		right_mask = (1 << tmp) - 1;

	OUT_BATCH(GEN7_GPGPU_WALKER | 13);

	OUT_BATCH(0); /* kernel offset */
	OUT_BATCH(0); /* indirect data length */
	OUT_BATCH(0); /* indirect data offset */

	/* SIMD size, thread w/h/d */
	OUT_BATCH(1 << 30 | /* SIMD16 */
		  0 << 16 | /* depth:1 */
		  0 << 8 | /* height:1 */
		  0); /* width:1 */

	/* thread group X */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(x_dim);

	/* thread group Y */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(y_dim);

	/* thread group Z */
	OUT_BATCH(0);
	OUT_BATCH(1);

	/* right mask */
	OUT_BATCH(right_mask);

	/* bottom mask, height 1, always 0xffffffff */
	OUT_BATCH(0xffffffff);
}

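/*
 * Emit one MEDIA_OBJECT with the (xoffset, yoffset) block origin as inline
 * data and no indirect data; on gen8+ parts other than Cherryview a
 * MEDIA_STATE_FLUSH is emitted after each object.
 */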
void
gen_emit_media_object(struct intel_batchbuffer *batch,
		       unsigned int xoffset, unsigned int yoffset)
{
	OUT_BATCH(GEN7_MEDIA_OBJECT | (8 - 2));

	/* interface descriptor offset */
	OUT_BATCH(0);

	/* without indirect data */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* scoreboard */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* inline data (xoffset, yoffset) */
	OUT_BATCH(xoffset);
	OUT_BATCH(yoffset);
	if (AT_LEAST_GEN(batch->devid, 8) && !IS_CHERRYVIEW(batch->devid))
		gen8_emit_media_state_flush(batch);
}

void
gen9_emit_state_base_address(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (19 - 2));

	/* general */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);

	/* stateless data port */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);

	/* surface */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY);

	/* dynamic */
	OUT_RELOC(batch->bo,
		  I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION,
		  0, BASE_ADDRESS_MODIFY);

	/* indirect */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* instruction */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
		  BASE_ADDRESS_MODIFY);

	/* general state buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* dynamic state buffer size */
	OUT_BATCH(1 << 12 | 1);
	/* indirect object buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* instruction buffer size; the modify-enable bit must be set here,
	 * otherwise the GPU may hang
	 */
	OUT_BATCH(1 << 12 | 1);

	/* Bindless surface state base address */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);
	OUT_BATCH(0xfffff000);
}