/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "r600.h"
#include "evergreend.h"
#include "r600_resource.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "r600_hw_context_priv.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#ifdef HAVE_OPENCL
#include "llvm_wrapper.h"
#endif

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 binds the smaller input parameter buffer and is used for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can only bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too => 10 image bindings for writing max.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
writable images will consume TEX slots, and VTX slots too, because of linear indexing

*/
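/* An illustrative sketch (not compiled into the driver) of the slot
 * numbering that falls out of the scheme above.  The helper names here are
 * hypothetical; they simply restate the +1/+2 offsets that
 * evergreen_set_compute_resources() and evergreen_set_cs_sampler_view()
 * apply further down in this file.
 */
#if 0
/* Writable image i goes to RAT(i+1): RAT0 is reserved for the global pool. */
static inline unsigned compute_image_rat_id(unsigned image_slot)
{
	return image_slot + 1;
}

/* Image/buffer i is fetched through FETCH(i+2): FETCH0 is the parameter
 * buffer (VTX0) and FETCH1 is the global pool (VTX1). */
static inline unsigned compute_image_fetch_id(unsigned image_slot)
{
	return image_slot + 2;
}
#endif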

static void evergreen_cs_set_vertex_buffer(
	struct r600_context * rctx,
	unsigned vb_index,
	unsigned offset,
	struct pipe_resource * buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer = buffer;
	vb->user_buffer = NULL;

	r600_inval_vertex_cache(rctx);
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	r600_atom_dirty(rctx, &state->atom);
}

const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_get_transfer, /* get_transfer */
	r600_compute_global_transfer_destroy, /* transfer_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
	r600_compute_global_transfer_inline_write /* transfer_inline_write */
};


void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
	void *p;

#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header * header;
	const unsigned char * code;

	COMPUTE_DBG("*** evergreen_create_compute_state\n");

	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#endif

	shader->ctx = (struct r600_context*)ctx;
	shader->resources = (struct evergreen_compute_resource*)
			CALLOC(sizeof(struct evergreen_compute_resource),
			get_compute_resource_num());
	shader->local_size = cso->req_local_mem; ///TODO: assert it
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

#ifdef HAVE_OPENCL
	shader->mod = llvm_parse_bitcode(code, header->num_bytes);

	r600_compute_shader_create(ctx_, shader->mod, &shader->bc);
#endif
	shader->shader_code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							shader->bc.ndw * 4);

	p = ctx->ws->buffer_map(shader->shader_code_bo->cs_buf, ctx->cs,
							PIPE_TRANSFER_WRITE);

	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	ctx->ws->buffer_unmap(shader->shader_code_bo->cs_buf);
	return shader;
}

void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
{
	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;

	free(shader->resources);
	free(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG("*** evergreen_bind_compute_state\n");

	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well.  Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
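/* Worked example (illustrative only): for a hypothetical launch with
 * grid_layout = {4, 2, 1} work groups and block_layout = {64, 1, 1} work
 * items per group, the buffer would be filled as:
 *
 *   DWORDS 0-2: 4, 2, 1      (work groups per dimension)
 *   DWORDS 3-5: 256, 2, 1    (global work items = grid * block)
 *   DWORDS 6-8: 64, 1, 1     (work items per group)
 *   DWORDS 9+ : the caller's kernel arguments (input_size bytes)
 *
 * The nine leading dwords are why kernel_parameters_offset_bytes below is 36.
 */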
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	int i;
	unsigned kernel_parameters_offset_bytes = 36;
	uint32_t * num_work_groups_start;
	uint32_t * global_size_start;
	uint32_t * local_size_start;
	uint32_t * kernel_parameters_start;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		unsigned buffer_size = shader->input_size;

		/* Add space for the grid dimensions */
		buffer_size += kernel_parameters_offset_bytes * sizeof(uint);
		shader->kernel_param = r600_compute_buffer_alloc_vram(
						ctx->screen, buffer_size);
	}

	num_work_groups_start = ctx->ws->buffer_map(
		shader->kernel_param->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the number of work groups */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, shader->input_size);

	for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
					(shader->input_size / 4); i++) {
		COMPUTE_DBG("input %i : %i\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->ws->buffer_unmap(shader->kernel_param->cs_buf);

	///ID=0 is reserved for the parameters
	evergreen_cs_set_vertex_buffer(ctx, 0, 0,
			(struct pipe_resource*)shader->kernel_param);
	///ID=0 is reserved for parameters
	evergreen_set_const_cache(shader, 0, shader->kernel_param,
						shader->input_size, 0);
}

static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->cs;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	/* XXX: Enable lds and get size from cs_shader_state */
	unsigned lds_size = 0;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++)	{
		grid_size *= grid_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;
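	/* Worked example (illustrative): a 16x16x1 block on a GPU with 8
	 * pipes gives 256 threads and a wave_divisor of 128, so
	 * num_waves = ceil(256 / 128) = 2 wavefronts per thread block.
	 */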

	COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
							num_pipes, num_waves);

	/* XXX: Partition the LDS between PS/CS.  By default half (4096 dwords
	 * on Evergreen) goes to Pixel Shaders and half goes to Compute Shaders.
	 * We may need to allocate the entire LDS space for Compute Shaders.
	 *
	 * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords)
	 * CM: CM_R_0286FC_SPI_LDS_MGMT :=  S_0286FC_NUM_LS_LDS(lds_dwords)
	 */
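	/* A rough sketch of what that partitioning could look like, kept
	 * disabled because LDS support is not wired up yet.  lds_dwords is a
	 * hypothetical value that would come from the compute shader state.
	 */
#if 0
	if (rctx->chip_class < CAYMAN) {
		r600_write_config_reg(cs, R_008E2C_SQ_LDS_RESOURCE_MGMT,
					S_008E2C_NUM_LS_LDS(lds_dwords));
	} else {
		r600_write_compute_context_reg(cs, CM_R_0286FC_SPI_LDS_MGMT,
					S_0286FC_NUM_LS_LDS(lds_dwords));
	}
#endif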

	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	r600_write_value(cs, grid_layout[0]);
	r600_write_value(cs, grid_layout[1]);
	r600_write_value(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	r600_write_value(cs, 1);
}

static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	int i;

	struct r600_resource *onebo = NULL;
	struct r600_pipe_state *cb_state;
	struct evergreen_compute_resource *resources =
					ctx->cs_shader_state.shader->resources;

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_atom(ctx, &ctx->start_compute_cs_cmd.atom);

	/* Emit cb_state */
	cb_state = ctx->states[R600_PIPE_STATE_FRAMEBUFFER];
	r600_context_pipe_state_emit(ctx, cb_state, RADEON_CP_PACKET3_COMPUTE_MODE);

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);


	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	for (i = 0; i < get_compute_resource_num(); i++) {
		if (resources[i].enabled) {
			int j;
			COMPUTE_DBG("resnum: %i, cdw: %i\n", i, cs->cdw);

			for (j = 0; j < resources[i].cs_end; j++) {
				if (resources[i].do_reloc[j]) {
					assert(resources[i].bo);
					evergreen_emit_ctx_reloc(ctx,
						resources[i].bo,
						resources[i].usage);
				}

				cs->buf[cs->cdw++] = resources[i].cs[j];
			}

			if (resources[i].bo) {
				onebo = resources[i].bo;
				evergreen_emit_ctx_reloc(ctx,
					resources[i].bo,
					resources[i].usage);

				///special case for textures
				if (resources[i].do_reloc
					[resources[i].cs_end] == 2) {
					evergreen_emit_ctx_reloc(ctx,
						resources[i].bo,
						resources[i].usage);
				}
			}
		}
	}

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* r600_flush_framebuffer() updates the cb_flush_flags and then
	 * calls r600_emit_atom() on the ctx->surface_sync_cmd.atom, which emits
	 * a SURFACE_SYNC packet via r600_emit_surface_sync().
	 *
	 * XXX r600_emit_surface_sync() hardcodes the CP_COHER_SIZE to
	 * 0xffffffff, so we will need to add a field to struct
	 * r600_surface_sync_cmd if we want to manually set this value.
	 */
	r600_flush_framebuffer(ctx, true /* Flush now */);

#if 0
	COMPUTE_DBG("cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG("%4i : 0x%08X\n", i, ctx->cs->buf[i]);
	}
#endif

	ctx->ws->cs_flush(ctx->cs, RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

	COMPUTE_DBG("shader started\n");

	ctx->ws->buffer_wait(onebo->buf, 0);

	COMPUTE_DBG("...\n");

	ctx->streamout_start = TRUE;
	ctx->streamout_append_bitmask = ~0;

}


/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(
		struct r600_context *rctx,
		struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_winsys_cs *cs = rctx->cs;
	uint64_t va;

	va = r600_resource_va(&rctx->screen->screen, &shader->shader_code_bo->b.b);

	r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	r600_write_value(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	r600_write_value(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(shader->bc.ngpr)
			| S_0288D4_STACK_SIZE(shader->bc.nstack));
	r600_write_value(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	r600_write_value(cs, PKT3C(PKT3_NOP, 0, 0));
	r600_write_value(cs, r600_context_bo_reloc(rctx, shader->shader_code_bo,
							RADEON_USAGE_READ));

	r600_inval_shader_cache(rctx);
}

static void evergreen_launch_grid(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG("PC: %i\n", pc);

	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	compute_emit_cs(ctx, block_layout, grid_layout);
}

static void evergreen_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG("*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (int i = 0; i < count; i++)	{
		/* The first two vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 2 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
					(struct r600_resource *)resources[i]->base.texture,
					buffer->chunk->start_in_dw*4,
					resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
		unsigned start_slot, unsigned count,
		struct pipe_sampler_view **views)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_sampler_view **resource =
		(struct r600_pipe_sampler_view **)views;

	for (int i = 0; i < count; i++)	{
		if (resource[i]) {
			assert(i+1 < 12);
			///FETCH0 = VTX0 (param buffer),
			///FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
			evergreen_set_tex_resource(ctx->cs_shader_state.shader, resource[i], i+2);
		}
	}
}

static void evergreen_bind_compute_sampler_states(
	struct pipe_context *ctx_,
	unsigned start_slot,
	unsigned num_samplers,
	void **samplers_)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_sampler_state ** samplers =
		(struct compute_sampler_state **)samplers_;

	for (int i = 0; i < num_samplers; i++) {
		if (samplers[i]) {
			evergreen_set_sampler_resource(
				ctx->cs_shader_state.shader, samplers[i], i);
		}
	}
}

static void evergreen_set_global_binding(
	struct pipe_context *ctx_, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;

	COMPUTE_DBG("*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	compute_memory_finalize_pending(pool, ctx_);

	for (int i = 0; i < n; i++)
	{
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		*(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
	}

	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
				(struct pipe_resource*)pool->bo);
}

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
{
	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256, EMIT_EARLY);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	switch (ctx->family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	evergreen_init_common_regs(cb, ctx->chip_class
			, ctx->family, ctx->screen->info.drm_minor);

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
						V_008958_DI_PT_POINTLIST);

	if (ctx->chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage.  It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}

	/* Context Registers */

	if (ctx->chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
						S_0286E8_TID_IN_GROUP_ENA
						| S_0286E8_TGID_ENA
						| S_0286E8_DISABLE_INDEX_PACK);

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code.  We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops.  However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1, and set the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed.  This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
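	/* Reading 0x1000FFF against the description above (the field split is
	 * inferred from the value, not verified against the register spec):
	 * the low 12 bits hold the maximum counter value (0xFFF = 4095), the
	 * middle bits hold the initial counter value (0), and the 0x01 in the
	 * top byte is the increment.
	 */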
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

void evergreen_init_compute_state_functions(struct r600_context *ctx)
{
	ctx->context.create_compute_state = evergreen_create_compute_state;
	ctx->context.delete_compute_state = evergreen_delete_compute_state;
	ctx->context.bind_compute_state = evergreen_bind_compute_state;
//	 ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	ctx->context.set_compute_resources = evergreen_set_compute_resources;
	ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
	ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
	ctx->context.set_global_binding = evergreen_set_global_binding;
	ctx->context.launch_grid = evergreen_launch_grid;

	/* We always use at least two vertex buffers for compute, one for
	 * parameters and one for global memory */
	ctx->cs_vertex_buffer_state.enabled_mask =
	ctx->cs_vertex_buffer_state.dirty_mask = 1 | 2;
}


struct pipe_resource *r600_compute_global_buffer_create(
	struct pipe_screen *screen,
	const struct pipe_resource *templ)
{
	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	struct r600_resource_global* result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	struct r600_screen* rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG("*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG("width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);

	int size_in_dw = (templ->width0+3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL)
	{
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}

void r600_compute_global_buffer_destroy(
	struct pipe_screen *screen,
	struct pipe_resource *res)
{
	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	struct r600_resource_global* buffer = (struct r600_resource_global*)res;
	struct r600_screen* rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

void* r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
	assert(transfer->box.x >= 0);
	assert(transfer->box.y == 0);
	assert(transfer->box.z == 0);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)transfer->resource;

	uint32_t* map;
	///TODO: do it better, mapping is not possible if the pool is too big

	if (!(map = ctx->ws->buffer_map(buffer->chunk->pool->bo->cs_buf,
						ctx->cs, transfer->usage))) {
		return NULL;
	}

	COMPUTE_DBG("buffer start: %lli\n", buffer->chunk->start_in_dw);
	return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
}

void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);

	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)transfer->resource;

	ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
}

struct pipe_transfer * r600_compute_global_get_transfer(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;

	compute_memory_finalize_pending(pool, ctx_);

	assert(resource->target == PIPE_BUFFER);
	struct r600_context *rctx = (struct r600_context*)ctx_;
	struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);

	transfer->resource = resource;
	transfer->level = level;
	transfer->usage = usage;
	transfer->box = *box;
	transfer->stride = 0;
	transfer->layer_stride = 0;
	transfer->data = NULL;

	/* Note strides are zero, this is ok for buffers, but not for
	 * textures 2d & higher at least.
	 */
	return transfer;
}

void r600_compute_global_transfer_destroy(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx_;
	util_slab_free(&rctx->pool_transfers, transfer);
}

void r600_compute_global_transfer_flush_region(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer,
	const struct pipe_box *box)
{
	assert(0 && "TODO");
}

void r600_compute_global_transfer_inline_write(
	struct pipe_context *pipe,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	const void *data,
	unsigned stride,
	unsigned layer_stride)
{
	assert(0 && "TODO");
}