/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Jerome Glisse
 */
#include "r600_hw_context_priv.h"
#include "radeonsi_pm4.h"
#include "radeonsi_pipe.h"
#include "sid.h"
#include "util/u_memory.h"
#include <errno.h>

#define GROUP_FORCE_NEW_BLOCK	0

/* Get backends mask */
void si_get_backend_mask(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct si_resource *buffer;
	uint32_t *results;
	unsigned num_backends = ctx->screen->info.r600_num_backends;
	unsigned i, mask = 0;

	/* if backend_map query is supported by the kernel */
	if (ctx->screen->info.r600_backend_map_valid) {
		unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
		unsigned backend_map = ctx->screen->info.r600_backend_map;
		unsigned item_width, item_mask;

		if (ctx->chip_class >= CAYMAN) {
			item_width = 4;
			item_mask = 0x7;
		}

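		/* Each tile pipe occupies item_width bits of backend_map and
		 * holds the index of the backend (DB) that pipe is routed to;
		 * walk the map and collect the set of referenced backends.
		 */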
		while(num_tile_pipes--) {
			i = backend_map & item_mask;
			mask |= (1<<i);
			backend_map >>= item_width;
		}
		if (mask != 0) {
			ctx->backend_mask = mask;
			return;
		}
	}

	/* otherwise backup path for older kernels */

	/* create buffer for event data */
	buffer = si_resource_create_custom(&ctx->screen->screen,
					   PIPE_USAGE_STAGING,
					   ctx->max_db*16);
	if (!buffer)
		goto err;

	/* initialize buffer with zeroes */
	results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
	if (results) {
		uint64_t va = 0;

		memset(results, 0, ctx->max_db * 4 * 4);
		ctx->ws->buffer_unmap(buffer->cs_buf);

		/* emit EVENT_WRITE for ZPASS_DONE */
		va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = va >> 32;

		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
		cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);

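		/* Mapping for read below is expected to make the winsys flush
		 * this CS and wait for the GPU (the buffer is referenced by
		 * it); only then are the ZPASS_DONE counts valid.
		 */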
		/* analyze results */
		results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
		if (results) {
			for(i = 0; i < ctx->max_db; i++) {
				/* at least highest bit will be set if backend is used */
				if (results[i*4 + 1])
					mask |= (1<<i);
			}
			ctx->ws->buffer_unmap(buffer->cs_buf);
		}
	}

	si_resource_reference(&buffer, NULL);

	if (mask != 0) {
		ctx->backend_mask = mask;
		return;
	}

err:
	/* fallback to old method - set num_backends lower bits to 1 */
	ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
	return;
}

/* initialize */
void si_need_cs_space(struct r600_context *ctx, unsigned num_dw,
			boolean count_draw_in)
{
	/* The number of dwords we already used in the CS so far. */
	num_dw += ctx->cs->cdw;

	if (count_draw_in) {
		/* The number of dwords all the dirty states would take. */
		num_dw += ctx->pm4_dirty_cdwords;

		/* The upper-bound of how much a draw command would take. */
		num_dw += SI_MAX_DRAW_CS_DWORDS;
	}

	/* Count in queries_suspend. */
	num_dw += ctx->num_cs_dw_queries_suspend;

	/* Count in streamout_end at the end of CS. */
	num_dw += ctx->num_cs_dw_streamout_end;

	/* Count in render_condition(NULL) at the end of CS. */
	if (ctx->predicate_drawing) {
		num_dw += 3;
	}

	/* Count in framebuffer cache flushes at the end of CS. */
	num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */

	/* Save 16 dwords for the fence mechanism. */
	num_dw += 16;

	/* Flush if there's not enough space. */
	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
		radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
	}
}

static void r600_flush_framebuffer(struct r600_context *ctx)
{
	struct si_pm4_state *pm4;

	if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
		return;

	pm4 = CALLOC_STRUCT(si_pm4_state);
	si_cmd_surface_sync(pm4, S_0085F0_CB0_DEST_BASE_ENA(1) |
				S_0085F0_CB1_DEST_BASE_ENA(1) |
				S_0085F0_CB2_DEST_BASE_ENA(1) |
				S_0085F0_CB3_DEST_BASE_ENA(1) |
				S_0085F0_CB4_DEST_BASE_ENA(1) |
				S_0085F0_CB5_DEST_BASE_ENA(1) |
				S_0085F0_CB6_DEST_BASE_ENA(1) |
				S_0085F0_CB7_DEST_BASE_ENA(1) |
				S_0085F0_DB_ACTION_ENA(1) |
				S_0085F0_DB_DEST_BASE_ENA(1));
	si_pm4_emit(ctx, pm4);
	si_pm4_free_state(ctx, pm4, ~0);

	ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
}

void si_context_flush(struct r600_context *ctx, unsigned flags)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	bool queries_suspended = false;

#if 0
	bool streamout_suspended = false;
#endif

	if (!cs->cdw)
		return;

	/* suspend queries */
	if (ctx->num_cs_dw_queries_suspend) {
		r600_context_queries_suspend(ctx);
		queries_suspended = true;
	}

#if 0
	if (ctx->num_cs_dw_streamout_end) {
		r600_context_streamout_end(ctx);
		streamout_suspended = true;
	}
#endif

	r600_flush_framebuffer(ctx);

	/* partial flush is needed to avoid lockups on some chips with user fences */
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

	/* force to keep tiling flags */
	flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;

	/* Flush the CS. */
	ctx->ws->cs_flush(ctx->cs, flags);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

#if 0
	if (streamout_suspended) {
		ctx->streamout_start = TRUE;
		ctx->streamout_append_bitmask = ~0;
	}
#endif

	/* resume queries */
	if (queries_suspended) {
		r600_context_queries_resume(ctx);
	}

	/* Mark all valid state groups as dirty so they get re-emitted on the
	 * next draw command.
	 */
	si_pm4_reset_emitted(ctx);
}

void si_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	si_need_cs_space(ctx, 10, FALSE);

	va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
	va = va + (offset << 2);

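	/* PS_PARTIAL_FLUSH waits for in-flight pixel shaders, then
	 * EVENT_WRITE_EOP writes 'value' to the fence BO once all prior work
	 * has drained to end-of-pipe, so the CPU can poll the fence location.
	 */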
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
	cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;       /* ADDRESS_LO */
	/* DATA_SEL = 1 (write 32-bit value), INT_SEL = 0 (no interrupt), ADDRESS_HI */
	cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
	cs->buf[cs->cdw++] = value;                   /* DATA_LO */
	cs->buf[cs->cdw++] = 0;                       /* DATA_HI */
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
}

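/* A query slot holds {begin, end} pairs of 64-bit counters written by the
 * GPU; the reported result is end - start. With test_status_bit, bit 63 of
 * both values must be set (expected to be set by the hardware once a DB has
 * written its data) before the pair is counted.
 */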
static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
				       bool test_status_bit)
{
	uint32_t *current_result = (uint32_t*)map;
	uint64_t start, end;

	start = (uint64_t)current_result[start_index] |
		(uint64_t)current_result[start_index+1] << 32;
	end = (uint64_t)current_result[end_index] |
	      (uint64_t)current_result[end_index+1] << 32;

	if (!test_status_bit ||
	    ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
		return end - start;
	}
	return 0;
}

static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
{
	unsigned results_base = query->results_start;
	char *map;

	map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
				  PIPE_TRANSFER_READ |
				  (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
	if (!map)
		return FALSE;

	/* count all results across all data blocks */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, true);
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 0, 2, true) != 0;
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, false);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		/* SAMPLE_STREAMOUTSTATS stores this structure:
		 * {
		 *    u64 NumPrimitivesWritten;
		 *    u64 PrimitiveStorageNeeded;
		 * }
		 * We only need NumPrimitivesWritten here. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 2, 6, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		/* Here we read PrimitiveStorageNeeded. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_STATISTICS:
		while (results_base != query->results_end) {
			query->result.so.num_primitives_written +=
				r600_query_read_result(map + results_base, 2, 6, true);
			query->result.so.primitives_storage_needed +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 2, 6, true) !=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	default:
		assert(0);
	}

	query->results_start = query->results_end;
	ctx->ws->buffer_unmap(query->buffer->cs_buf);
	return TRUE;
}

void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	unsigned new_results_end, i;
	uint32_t *results;
	uint64_t va;

	si_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);

	new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;

	/* collect current results if query buffer is full */
	if (new_results_end == query->results_start) {
		r600_query_result(ctx, query, TRUE);
	}

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		if (results) {
			results = (uint32_t*)((char*)results + query->results_end);
			memset(results, 0, query->result_size);

			/* Set top bits for unused backends, so their result
			 * slots count as already written when collected. */
			for (i = 0; i < ctx->max_db; i++) {
				if (!(ctx->backend_mask & (1<<i))) {
					results[(i * 4)+1] = 0x80000000;
					results[(i * 4)+3] = 0x80000000;
				}
			}
			ctx->ws->buffer_unmap(query->buffer->cs_buf);
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		results = (uint32_t*)((char*)results + query->results_end);
		memset(results, 0, query->result_size);
		ctx->ws->buffer_unmap(query->buffer->cs_buf);
		break;
	default:
		assert(0);
	}

	/* emit begin query */
	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	va += query->results_end;

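	/* The begin sample lands at results_end; r600_query_end later writes
	 * the matching end sample at results_end + result_size/2 (or at +8
	 * within each 16-byte per-DB pair for occlusion queries).
	 */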
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
}

void r600_query_end(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	/* emit end query */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		va += query->results_end + 8;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		va += query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
	ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
}

void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
			    int flag_wait)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	if (operation == PREDICATION_OP_CLEAR) {
		si_need_cs_space(ctx, 3, FALSE);

		cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
	} else {
		unsigned results_base = query->results_start;
		unsigned count;
		uint32_t op;

		/* find count of the query data blocks */
		count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
		count /= query->result_size;
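		/* e.g. with width0 = 4096, results_start = 4064 and
		 * results_end = 32, this is (4096 + 32 - 4064) % 4096 = 64
		 * bytes, i.e. two 32-byte blocks for a streamout query.
		 */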

		si_need_cs_space(ctx, 5 * count, TRUE);

		op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
				(flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
		va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);

		/* emit predicate packets for all data blocks */
		while (results_base != query->results_end) {
			cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
			cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
			cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
									     RADEON_USAGE_READ);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;

			/* set CONTINUE bit for all packets except the first */
			op |= PREDICATION_CONTINUE;
		}
	}
}

struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
{
	struct r600_query *query;
	unsigned buffer_size = 4096;

	query = CALLOC_STRUCT(r600_query);
	if (query == NULL)
		return NULL;

	query->type = query_type;

	switch (query_type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		query->result_size = 16 * ctx->max_db;
		query->num_cs_dw = 6;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		query->result_size = 16;
		query->num_cs_dw = 8;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
		query->result_size = 32;
		query->num_cs_dw = 6;
		break;
	default:
		assert(0);
		FREE(query);
		return NULL;
	}

	/* adjust buffer size to simplify offsets wrapping math */
	buffer_size -= buffer_size % query->result_size;
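	/* e.g. an occlusion query on a chip with max_db = 8 has
	 * result_size = 128; 4096 is already a multiple of that, so the
	 * buffer keeps 32 slots, while result sizes that do not divide 4096
	 * evenly get trimmed here.
	 */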

	/* Queries are normally read by the CPU after
	 * being written by the gpu, hence staging is probably a good
	 * usage pattern.
	 */
	query->buffer = si_resource_create_custom(&ctx->screen->screen,
						  PIPE_USAGE_STAGING,
						  buffer_size);
	if (!query->buffer) {
		FREE(query);
		return NULL;
	}
	return query;
}

void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
{
	si_resource_reference(&query->buffer, NULL);
	free(query);
}

boolean r600_context_query_result(struct r600_context *ctx,
				struct r600_query *query,
				boolean wait, void *vresult)
{
	boolean *result_b = (boolean*)vresult;
	uint64_t *result_u64 = (uint64_t*)vresult;
	struct pipe_query_data_so_statistics *result_so =
		(struct pipe_query_data_so_statistics*)vresult;

	if (!r600_query_result(ctx, query, wait))
		return FALSE;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		*result_u64 = query->result.u64;
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		*result_b = query->result.b;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		*result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		*result_so = query->result.so;
		break;
	default:
		assert(0);
	}
	return TRUE;
}

void r600_context_queries_suspend(struct r600_context *ctx)
{
	struct r600_query *query;

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_end(ctx, query);
	}
	assert(ctx->num_cs_dw_queries_suspend == 0);
}

void r600_context_queries_resume(struct r600_context *ctx)
{
	struct r600_query *query;

	assert(ctx->num_cs_dw_queries_suspend == 0);

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_begin(ctx, query);
	}
}

void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	si_need_cs_space(ctx, 14 + 21, TRUE);

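	/* Program the draw-opaque state: offset 0 into the streamout
	 * BUFFER_FILLED_SIZE counter and the vertex stride in dwords, which
	 * the VGT uses to derive the vertex count for the draw.
	 */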
	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = 0;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->stride >> 2;

#if 0
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = 0; /* src address lo */
	cs->buf[cs->cdw++] = 0; /* src address hi */
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */
#endif

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);

#if 0 /* I have not found this useful yet. */
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
	cs->buf[cs->cdw++] = 0; /* unused */
	cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->b.buffer_offset >> 2;

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct si_resource*)t->b.buffer,
							     RADEON_USAGE_WRITE);

	cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
	cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
	cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2;  /* register */
	cs->buf[cs->cdw++] = 0;
	cs->buf[cs->cdw++] = 0; /* reference value */
	cs->buf[cs->cdw++] = 0xffffffff; /* mask */
	cs->buf[cs->cdw++] = 4; /* poll interval */
#endif
}