/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Jerome Glisse
 */
#include "r600_hw_context_priv.h"
#include "radeonsi_pm4.h"
#include "radeonsi_pipe.h"
#include "sid.h"
#include "util/u_memory.h"
#include <errno.h>

#define GROUP_FORCE_NEW_BLOCK   0

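/* A note on the backend (DB) mask used below and by the query code:
 * ZPASS_DONE writes one result slot per depth backend, and backends that
 * are not present on a given chip never write theirs.  r600_query_begin()
 * therefore pre-marks the slots of unused backends as valid, which only
 * works if we know which backends exist.  The mask is computed once up
 * front, either from the kernel's backend_map info or, failing that, by
 * probing with a ZPASS_DONE event and seeing which slots get written.
 */
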
/* Get backends mask */
void si_get_backend_mask(struct r600_context *ctx)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        struct si_resource *buffer;
        uint32_t *results;
        unsigned num_backends = ctx->screen->info.r600_num_backends;
        unsigned i, mask = 0;

        /* if backend_map query is supported by the kernel */
        if (ctx->screen->info.r600_backend_map_valid) {
                unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
                unsigned backend_map = ctx->screen->info.r600_backend_map;
                unsigned item_width, item_mask;

                /* Each backend id occupies item_width bits of backend_map.
                 * All SI parts are >= CAYMAN, so this branch is always taken. */
                if (ctx->chip_class >= CAYMAN) {
                        item_width = 4;
                        item_mask = 0x7;
                }

                while (num_tile_pipes--) {
                        i = backend_map & item_mask;
                        mask |= (1 << i);
                        backend_map >>= item_width;
                }
                if (mask != 0) {
                        ctx->backend_mask = mask;
                        return;
                }
        }

        /* otherwise, use the backup path for older kernels */

        /* create buffer for event data */
        buffer = si_resource_create_custom(&ctx->screen->screen,
                                           PIPE_USAGE_STAGING,
                                           ctx->max_db * 16);
        if (!buffer)
                goto err;

        /* initialize buffer with zeroes */
        results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
        if (results) {
                uint64_t va = 0;

                memset(results, 0, ctx->max_db * 4 * 4);
                ctx->ws->buffer_unmap(buffer->cs_buf);

                /* emit EVENT_WRITE for ZPASS_DONE */
                va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = va >> 32;

                cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
                cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);

                /* analyze results */
                results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
                if (results) {
                        for (i = 0; i < ctx->max_db; i++) {
                                /* at least the highest bit will be set if the backend is used */
                                if (results[i*4 + 1])
                                        mask |= (1 << i);
                        }
                        ctx->ws->buffer_unmap(buffer->cs_buf);
                }
        }

        si_resource_reference(&buffer, NULL);

        if (mask != 0) {
                ctx->backend_mask = mask;
                return;
        }

err:
        /* fall back to the old method - set the num_backends lowest bits to 1 */
        ctx->backend_mask = (~((uint32_t)0)) >> (32 - num_backends);
        return;
}

/* Make sure the CS has room for at least num_dw more dwords, flushing if it does not. */
void si_need_cs_space(struct r600_context *ctx, unsigned num_dw,
                      boolean count_draw_in)
{
        /* The number of dwords we already used in the CS so far. */
        num_dw += ctx->cs->cdw;

        if (count_draw_in) {
                /* The number of dwords all the dirty states would take. */
                num_dw += ctx->pm4_dirty_cdwords;

                /* The upper-bound of how much a draw command would take. */
                num_dw += SI_MAX_DRAW_CS_DWORDS;
        }

        /* Count in queries_suspend. */
        num_dw += ctx->num_cs_dw_queries_suspend;

        /* Count in streamout_end at the end of CS. */
        num_dw += ctx->num_cs_dw_streamout_end;

        /* Count in render_condition(NULL) at the end of CS. */
        if (ctx->predicate_drawing) {
                num_dw += 3;
        }

        /* Count in framebuffer cache flushes at the end of CS. */
        num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */

        /* Save 16 dwords for the fence mechanism. */
        num_dw += 16;

        /* Flush if there's not enough space. */
        if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
                radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
        }
}

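/* Flush the CB/DB destination caches with a SURFACE_SYNC so framebuffer
 * writes are visible before the buffers are read elsewhere.
 * R600_CONTEXT_DST_CACHES_DIRTY is expected to be set by the draw path
 * whenever the currently bound framebuffer may have been written.
 */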
static void r600_flush_framebuffer(struct r600_context *ctx)
{
        struct si_pm4_state *pm4;

        if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
                return;

        pm4 = CALLOC_STRUCT(si_pm4_state);
        si_cmd_surface_sync(pm4, S_0085F0_CB0_DEST_BASE_ENA(1) |
                            S_0085F0_CB1_DEST_BASE_ENA(1) |
                            S_0085F0_CB2_DEST_BASE_ENA(1) |
                            S_0085F0_CB3_DEST_BASE_ENA(1) |
                            S_0085F0_CB4_DEST_BASE_ENA(1) |
                            S_0085F0_CB5_DEST_BASE_ENA(1) |
                            S_0085F0_CB6_DEST_BASE_ENA(1) |
                            S_0085F0_CB7_DEST_BASE_ENA(1) |
                            S_0085F0_DB_ACTION_ENA(1) |
                            S_0085F0_DB_DEST_BASE_ENA(1));
        si_pm4_emit(ctx, pm4);
        si_pm4_free_state(ctx, pm4, ~0);

        ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
}

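/* Submit the current command stream.  Active queries are suspended (ended)
 * first and resumed (restarted) in the new CS, framebuffer caches are
 * flushed, and a PS_PARTIAL_FLUSH is emitted before the CS is handed to
 * the winsys; afterwards all emitted state is marked dirty so it is
 * re-sent with the next draw.
 */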
void si_context_flush(struct r600_context *ctx, unsigned flags)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        bool queries_suspended = false;

#if 0
        bool streamout_suspended = false;
#endif

        if (!cs->cdw)
                return;

        /* suspend queries */
        if (ctx->num_cs_dw_queries_suspend) {
                r600_context_queries_suspend(ctx);
                queries_suspended = true;
        }

#if 0
        if (ctx->num_cs_dw_streamout_end) {
                r600_context_streamout_end(ctx);
                streamout_suspended = true;
        }
#endif

        r600_flush_framebuffer(ctx);

        /* a partial flush is needed to avoid lockups on some chips with user fences */
        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

        /* keep the tiling flags across the flush */
        flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;

        /* Flush the CS. */
        ctx->ws->cs_flush(ctx->cs, flags);

        ctx->pm4_dirty_cdwords = 0;
        ctx->flags = 0;

#if 0
        if (streamout_suspended) {
                ctx->streamout_start = TRUE;
                ctx->streamout_append_bitmask = ~0;
        }
#endif

        /* resume queries */
        if (queries_suspended) {
                r600_context_queries_resume(ctx);
        }

        /* Mark all valid state groups as dirty so they get re-emitted on
         * the next draw command.
         */
        si_pm4_reset_emitted(ctx);
}

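/* Write a 32-bit fence value to fence_bo at the given dword offset once all
 * prior work has finished.  EVENT_WRITE_EOP with CACHE_FLUSH_AND_INV_TS
 * performs the write when the event reaches the bottom of the pipe; in the
 * packet below, DATA_SEL (bits 31:29) = 1 selects the 32-bit DATA_LO value
 * as the payload and the interrupt-enable field (bit 24) is left at 0, so
 * no interrupt is raised.  The leading PS_PARTIAL_FLUSH works around
 * lockups seen on some chips with user fences.
 */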
void si_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        uint64_t va;

        si_need_cs_space(ctx, 10, FALSE);

        va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
        va = va + (offset << 2);

        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
        cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;       /* ADDRESS_LO */
        /* DATA_SEL | INT_EN | ADDRESS_HI */
        cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
        cs->buf[cs->cdw++] = value;                   /* DATA_LO */
        cs->buf[cs->cdw++] = 0;                       /* DATA_HI */
        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
}

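/* Helper for reading one {begin, end} counter pair out of a mapped query
 * buffer.  Each value is a 64-bit counter stored as two dwords (low dword
 * first) at start_index and end_index.  When test_status_bit is set, bit 63
 * of both snapshots must be set for the pair to count: the hardware sets it
 * on counters it actually wrote, and r600_query_begin() pre-sets it for
 * backends that do not exist, so incomplete results contribute zero.
 */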
static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
                                       bool test_status_bit)
{
        uint32_t *current_result = (uint32_t*)map;
        uint64_t start, end;

        start = (uint64_t)current_result[start_index] |
                (uint64_t)current_result[start_index+1] << 32;
        end = (uint64_t)current_result[end_index] |
              (uint64_t)current_result[end_index+1] << 32;

        if (!test_status_bit ||
            ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
                return end - start;
        }
        return 0;
}

static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
{
        unsigned results_base = query->results_start;
        char *map;

        map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
                                  PIPE_TRANSFER_READ |
                                  (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
        if (!map)
                return FALSE;

        /* count all results across all data blocks */
        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
                while (results_base != query->results_end) {
                        query->result.u64 +=
                                r600_query_read_result(map + results_base, 0, 2, true);
                        results_base = (results_base + 16) % query->buffer->b.b.width0;
                }
                break;
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                while (results_base != query->results_end) {
                        query->result.b = query->result.b ||
                                r600_query_read_result(map + results_base, 0, 2, true) != 0;
                        results_base = (results_base + 16) % query->buffer->b.b.width0;
                }
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                while (results_base != query->results_end) {
                        query->result.u64 +=
                                r600_query_read_result(map + results_base, 0, 2, false);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
                }
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
                /* SAMPLE_STREAMOUTSTATS stores this structure:
                 * {
                 *    u64 NumPrimitivesWritten;
                 *    u64 PrimitiveStorageNeeded;
                 * }
                 * We only need NumPrimitivesWritten here. */
                while (results_base != query->results_end) {
                        query->result.u64 +=
                                r600_query_read_result(map + results_base, 2, 6, true);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
                }
                break;
        case PIPE_QUERY_PRIMITIVES_GENERATED:
                /* Here we read PrimitiveStorageNeeded. */
                while (results_base != query->results_end) {
                        query->result.u64 +=
                                r600_query_read_result(map + results_base, 0, 4, true);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
                }
                break;
        case PIPE_QUERY_SO_STATISTICS:
                while (results_base != query->results_end) {
                        query->result.so.num_primitives_written +=
                                r600_query_read_result(map + results_base, 2, 6, true);
                        query->result.so.primitives_storage_needed +=
                                r600_query_read_result(map + results_base, 0, 4, true);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
                }
                break;
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                while (results_base != query->results_end) {
                        query->result.b = query->result.b ||
                                r600_query_read_result(map + results_base, 2, 6, true) !=
                                r600_query_read_result(map + results_base, 0, 4, true);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
                }
                break;
        default:
                assert(0);
        }

        query->results_start = query->results_end;
        ctx->ws->buffer_unmap(query->buffer->cs_buf);
        return TRUE;
}

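/* Query results accumulate in a small ring buffer: the blocks between
 * results_start and results_end have been emitted but not yet folded into
 * query->result, and offsets wrap at the buffer size (width0), which
 * r600_context_query_create() trims to a multiple of result_size.
 * r600_query_begin() reserves the next block (reading back old results
 * first if the ring is full), clears it and emits the "begin" event;
 * r600_query_end() writes the second half of the block and advances
 * results_end.
 */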
void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        unsigned new_results_end, i;
        uint32_t *results;
        uint64_t va;

        si_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);

        new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;

        /* collect current results if query buffer is full */
        if (new_results_end == query->results_start) {
                r600_query_result(ctx, query, TRUE);
        }

        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
                if (results) {
                        results = (uint32_t*)((char*)results + query->results_end);
                        memset(results, 0, query->result_size);

                        /* Set top bits for unused backends */
                        for (i = 0; i < ctx->max_db; i++) {
                                if (!(ctx->backend_mask & (1<<i))) {
                                        results[(i * 4)+1] = 0x80000000;
                                        results[(i * 4)+3] = 0x80000000;
                                }
                        }
                        ctx->ws->buffer_unmap(query->buffer->cs_buf);
                }
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
        case PIPE_QUERY_SO_STATISTICS:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
                if (results) {
                        results = (uint32_t*)((char*)results + query->results_end);
                        memset(results, 0, query->result_size);
                        ctx->ws->buffer_unmap(query->buffer->cs_buf);
                }
                break;
        default:
                assert(0);
        }

        /* emit begin query */
        va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
        va += query->results_end;

        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
        case PIPE_QUERY_SO_STATISTICS:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
                cs->buf[cs->cdw++] = query->results_end;
                cs->buf[cs->cdw++] = 0;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF); /* DATA_SEL = 3: 64-bit GPU timestamp */
                cs->buf[cs->cdw++] = 0;
                cs->buf[cs->cdw++] = 0;
                break;
        default:
                assert(0);
        }
        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

        ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
}

void r600_query_end(struct r600_context *ctx, struct r600_query *query)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        uint64_t va;

        va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
        /* emit end query */
        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                va += query->results_end + 8;
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
        case PIPE_QUERY_SO_STATISTICS:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
                cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
                cs->buf[cs->cdw++] = 0;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                va += query->results_end + query->result_size/2;
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
                cs->buf[cs->cdw++] = 0;
                cs->buf[cs->cdw++] = 0;
                break;
        default:
                assert(0);
        }
        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

        query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
        ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
}

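/* Set up conditional rendering.  For PREDICATION_OP_CLEAR the predicate is
 * simply reset.  Otherwise one SET_PREDICATION packet is emitted for every
 * unread result block in the query ring; each packet after the first has
 * PREDICATION_CONTINUE set so the blocks are combined into a single
 * visibility decision rather than each packet overriding the previous one.
 */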
void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
                            int flag_wait)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        uint64_t va;

        if (operation == PREDICATION_OP_CLEAR) {
                si_need_cs_space(ctx, 3, FALSE);

                cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
                cs->buf[cs->cdw++] = 0;
                cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
        } else {
                unsigned results_base = query->results_start;
                unsigned count;
                uint32_t op;

                /* find count of the query data blocks */
                count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
                count /= query->result_size;

                si_need_cs_space(ctx, 5 * count, TRUE);

                op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
                     (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
                va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);

                /* emit predicate packets for all data blocks */
                while (results_base != query->results_end) {
                        cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
                        cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
                        cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
                        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
                        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
                                                                   RADEON_USAGE_READ);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.width0;

                        /* set CONTINUE bit for all packets except the first */
                        op |= PREDICATION_CONTINUE;
                }
        }
}

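/* Result block sizes per query type: occlusion queries store a {begin, end}
 * pair of 64-bit counters for every depth backend (16 bytes * max_db),
 * timestamp queries store one 64-bit timestamp per half (16 bytes total),
 * and the streamout-statistics events write {NumPrimitivesWritten,
 * PrimitiveStorageNeeded} at both begin and end (32 bytes).  num_cs_dw is
 * the CS cost of emitting one begin or end; it is added to the
 * queries_suspend budget while the query is active so a flush always has
 * room to suspend it.
 */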
struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
{
        struct r600_query *query;
        unsigned buffer_size = 4096;

        query = CALLOC_STRUCT(r600_query);
        if (query == NULL)
                return NULL;

        query->type = query_type;

        switch (query_type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                query->result_size = 16 * ctx->max_db;
                query->num_cs_dw = 6;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                query->result_size = 16;
                query->num_cs_dw = 8;
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
        case PIPE_QUERY_SO_STATISTICS:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
                query->result_size = 32;
                query->num_cs_dw = 6;
                break;
        default:
                assert(0);
                FREE(query);
                return NULL;
        }

        /* adjust buffer size to simplify offset wrapping math */
        buffer_size -= buffer_size % query->result_size;

        /* Queries are normally read by the CPU after
         * being written by the GPU, hence staging is probably a good
         * usage pattern.
         */
        query->buffer = si_resource_create_custom(&ctx->screen->screen,
                                                  PIPE_USAGE_STAGING,
                                                  buffer_size);
        if (!query->buffer) {
                FREE(query);
                return NULL;
        }
        return query;
}

void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
{
        si_resource_reference(&query->buffer, NULL);
        FREE(query);
}

boolean r600_context_query_result(struct r600_context *ctx,
                                  struct r600_query *query,
                                  boolean wait, void *vresult)
{
        boolean *result_b = (boolean*)vresult;
        uint64_t *result_u64 = (uint64_t*)vresult;
        struct pipe_query_data_so_statistics *result_so =
                (struct pipe_query_data_so_statistics*)vresult;

        if (!r600_query_result(ctx, query, wait))
                return FALSE;

        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
                *result_u64 = query->result.u64;
                break;
        case PIPE_QUERY_OCCLUSION_PREDICATE:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                *result_b = query->result.b;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                /* convert clock ticks to nanoseconds (the crystal frequency is in kHz) */
                *result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
                break;
        case PIPE_QUERY_SO_STATISTICS:
                *result_so = query->result.so;
                break;
        default:
                assert(0);
        }
        return TRUE;
}

void r600_context_queries_suspend(struct r600_context *ctx)
{
        struct r600_query *query;

        LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
                r600_query_end(ctx, query);
        }
        assert(ctx->num_cs_dw_queries_suspend == 0);
}

void r600_context_queries_resume(struct r600_context *ctx)
{
        struct r600_query *query;

        assert(ctx->num_cs_dw_queries_suspend == 0);

        LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
                r600_query_begin(ctx, query);
        }
}

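/* Program the VGT registers used for "draw opaque" (DrawTransformFeedback-
 * style) draws: the opaque offset is cleared and the vertex stride of the
 * streamout target is loaded, so the vertex count can be derived from the
 * buffer's filled size.  The COPY_DW/CP_COHER sequences that would load the
 * filled size from memory are still disabled (#if 0) here.
 */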
void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        si_need_cs_space(ctx, 14 + 21, TRUE);

        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
        cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
        cs->buf[cs->cdw++] = 0;

        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
        cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
        cs->buf[cs->cdw++] = t->stride >> 2;

#if 0
        cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
        cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
        cs->buf[cs->cdw++] = 0; /* src address lo */
        cs->buf[cs->cdw++] = 0; /* src address hi */
        cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
        cs->buf[cs->cdw++] = 0; /* unused */
#endif

        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);

#if 0 /* I have not found this useful yet. */
        cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
        cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
        cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
        cs->buf[cs->cdw++] = 0; /* unused */
        cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
        cs->buf[cs->cdw++] = 0; /* unused */

        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
        cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
        cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;

        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
        cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2;
        cs->buf[cs->cdw++] = t->b.buffer_offset >> 2;

        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct si_resource*)t->b.buffer,
                                                   RADEON_USAGE_WRITE);

        cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
        cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
        cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2; /* register */
        cs->buf[cs->cdw++] = 0;
        cs->buf[cs->cdw++] = 0; /* reference value */
        cs->buf[cs->cdw++] = 0xffffffff; /* mask */
        cs->buf[cs->cdw++] = 4; /* poll interval */
#endif
}