1 /*
2  * Copyright © 2017 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20  * DEALINGS IN THE SOFTWARE.
21  */
22 
23 /**
24  * @file iris_query.c
25  *
26  * ============================= GENXML CODE =============================
27  *              [This file is compiled once per generation.]
28  * =======================================================================
29  *
30  * Query object support.  This allows measuring various simple statistics
31  * via counters on the GPU.  We use GenX code for MI_MATH calculations.
32  */
33 
34 #include <stdio.h>
35 #include <errno.h>
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "pipe/p_context.h"
39 #include "pipe/p_screen.h"
40 #include "util/u_inlines.h"
41 #include "util/u_upload_mgr.h"
42 #include "iris_context.h"
43 #include "iris_defines.h"
44 #include "iris_fence.h"
45 #include "iris_monitor.h"
46 #include "iris_resource.h"
47 #include "iris_screen.h"
48 
49 #include "iris_genx_macros.h"
50 
/* Register number of the SO_PRIM_STORAGE_NEEDED counter for a given stream:
 * the stream 0 register number plus 8 for each following stream index.
 */
#define SO_PRIM_STORAGE_NEEDED(stream) \
   (GENX(SO_PRIM_STORAGE_NEEDED0_num) + (stream) * 8)

/* Register number of the SO_NUM_PRIMS_WRITTEN counter for a given stream,
 * computed the same way from the stream 0 register number.
 */
#define SO_NUM_PRIMS_WRITTEN(stream) \
   (GENX(SO_NUM_PRIMS_WRITTEN0_num) + (stream) * 8)
53 
54 struct iris_query {
55    enum pipe_query_type type;
56    int index;
57 
58    bool ready;
59 
60    bool stalled;
61 
62    uint64_t result;
63 
64    struct iris_state_ref query_state_ref;
65    struct iris_query_snapshots *map;
66    struct iris_syncobj *syncobj;
67 
68    int batch_idx;
69 
70    struct iris_monitor_object *monitor;
71 
72    /* Fence for PIPE_QUERY_GPU_FINISHED. */
73    struct pipe_fence_handle *fence;
74 };
75 
/**
 * Memory layout of the snapshot area used by most query types.  The field
 * order is significant: the offsets are handed to the GPU via offsetof().
 */
struct iris_query_snapshots {
   /** MI_PREDICATE_RESULT value saved by conditional rendering setup. */
   uint64_t predicate_result;

   /** Non-zero once the start/end snapshots have been written. */
   uint64_t snapshots_landed;

   /** Counter snapshot taken when the query begins. */
   uint64_t start;
   /** Counter snapshot taken when the query ends. */
   uint64_t end;
};
87 
/**
 * Memory layout of the snapshot area used by the streamout overflow
 * queries.  The first two fields line up with struct iris_query_snapshots.
 */
struct iris_query_so_overflow {
   uint64_t predicate_result;
   uint64_t snapshots_landed;

   /* Per-stream counter pairs; element [0] is the begin snapshot and
    * element [1] is the end snapshot.
    */
   struct {
      uint64_t prim_storage_needed[2];
      uint64_t num_prims[2];
   } stream[4];
};
97 
98 static struct gen_mi_value
query_mem64(struct iris_query * q,uint32_t offset)99 query_mem64(struct iris_query *q, uint32_t offset)
100 {
101    struct iris_address addr = {
102       .bo = iris_resource_bo(q->query_state_ref.res),
103       .offset = q->query_state_ref.offset + offset,
104       .access = IRIS_DOMAIN_OTHER_WRITE
105    };
106    return gen_mi_mem64(addr);
107 }
108 
109 /**
110  * Is this type of query written by PIPE_CONTROL?
111  */
112 static bool
iris_is_query_pipelined(struct iris_query * q)113 iris_is_query_pipelined(struct iris_query *q)
114 {
115    switch (q->type) {
116    case PIPE_QUERY_OCCLUSION_COUNTER:
117    case PIPE_QUERY_OCCLUSION_PREDICATE:
118    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
119    case PIPE_QUERY_TIMESTAMP:
120    case PIPE_QUERY_TIMESTAMP_DISJOINT:
121    case PIPE_QUERY_TIME_ELAPSED:
122       return true;
123 
124    default:
125       return false;
126    }
127 }
128 
129 static void
mark_available(struct iris_context * ice,struct iris_query * q)130 mark_available(struct iris_context *ice, struct iris_query *q)
131 {
132    struct iris_batch *batch = &ice->batches[q->batch_idx];
133    unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
134    unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
135    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
136    offset += q->query_state_ref.offset;
137 
138    if (!iris_is_query_pipelined(q)) {
139       batch->screen->vtbl.store_data_imm64(batch, bo, offset, true);
140    } else {
141       /* Order available *after* the query results. */
142       flags |= PIPE_CONTROL_FLUSH_ENABLE;
143       iris_emit_pipe_control_write(batch, "query: mark available",
144                                    flags, bo, offset, true);
145    }
146 }
147 
148 /**
149  * Write PS_DEPTH_COUNT to q->(dest) via a PIPE_CONTROL.
150  */
151 static void
iris_pipelined_write(struct iris_batch * batch,struct iris_query * q,enum pipe_control_flags flags,unsigned offset)152 iris_pipelined_write(struct iris_batch *batch,
153                      struct iris_query *q,
154                      enum pipe_control_flags flags,
155                      unsigned offset)
156 {
157    const struct gen_device_info *devinfo = &batch->screen->devinfo;
158    const unsigned optional_cs_stall =
159       GEN_GEN == 9 && devinfo->gt == 4 ?  PIPE_CONTROL_CS_STALL : 0;
160    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
161 
162    iris_emit_pipe_control_write(batch, "query: pipelined snapshot write",
163                                 flags | optional_cs_stall,
164                                 bo, offset, 0ull);
165 }
166 
167 static void
write_value(struct iris_context * ice,struct iris_query * q,unsigned offset)168 write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
169 {
170    struct iris_batch *batch = &ice->batches[q->batch_idx];
171    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
172 
173    if (!iris_is_query_pipelined(q)) {
174       iris_emit_pipe_control_flush(batch,
175                                    "query: non-pipelined snapshot write",
176                                    PIPE_CONTROL_CS_STALL |
177                                    PIPE_CONTROL_STALL_AT_SCOREBOARD);
178       q->stalled = true;
179    }
180 
181    switch (q->type) {
182    case PIPE_QUERY_OCCLUSION_COUNTER:
183    case PIPE_QUERY_OCCLUSION_PREDICATE:
184    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
185       if (GEN_GEN >= 10) {
186          /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
187           *  bit set prior to programming a PIPE_CONTROL with Write PS Depth
188           *  Count sync operation."
189           */
190          iris_emit_pipe_control_flush(batch,
191                                       "workaround: depth stall before writing "
192                                       "PS_DEPTH_COUNT",
193                                       PIPE_CONTROL_DEPTH_STALL);
194       }
195       iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
196                            PIPE_CONTROL_WRITE_DEPTH_COUNT |
197                            PIPE_CONTROL_DEPTH_STALL,
198                            offset);
199       break;
200    case PIPE_QUERY_TIME_ELAPSED:
201    case PIPE_QUERY_TIMESTAMP:
202    case PIPE_QUERY_TIMESTAMP_DISJOINT:
203       iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
204                            PIPE_CONTROL_WRITE_TIMESTAMP,
205                            offset);
206       break;
207    case PIPE_QUERY_PRIMITIVES_GENERATED:
208       batch->screen->vtbl.store_register_mem64(batch,
209                                      q->index == 0 ?
210                                      GENX(CL_INVOCATION_COUNT_num) :
211                                      SO_PRIM_STORAGE_NEEDED(q->index),
212                                      bo, offset, false);
213       break;
214    case PIPE_QUERY_PRIMITIVES_EMITTED:
215       batch->screen->vtbl.store_register_mem64(batch,
216                                      SO_NUM_PRIMS_WRITTEN(q->index),
217                                      bo, offset, false);
218       break;
219    case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
220       static const uint32_t index_to_reg[] = {
221          GENX(IA_VERTICES_COUNT_num),
222          GENX(IA_PRIMITIVES_COUNT_num),
223          GENX(VS_INVOCATION_COUNT_num),
224          GENX(GS_INVOCATION_COUNT_num),
225          GENX(GS_PRIMITIVES_COUNT_num),
226          GENX(CL_INVOCATION_COUNT_num),
227          GENX(CL_PRIMITIVES_COUNT_num),
228          GENX(PS_INVOCATION_COUNT_num),
229          GENX(HS_INVOCATION_COUNT_num),
230          GENX(DS_INVOCATION_COUNT_num),
231          GENX(CS_INVOCATION_COUNT_num),
232       };
233       const uint32_t reg = index_to_reg[q->index];
234 
235       batch->screen->vtbl.store_register_mem64(batch, reg, bo, offset, false);
236       break;
237    }
238    default:
239       assert(false);
240    }
241 }
242 
243 static void
write_overflow_values(struct iris_context * ice,struct iris_query * q,bool end)244 write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
245 {
246    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
247    uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
248    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
249    uint32_t offset = q->query_state_ref.offset;
250 
251    iris_emit_pipe_control_flush(batch,
252                                 "query: write SO overflow snapshots",
253                                 PIPE_CONTROL_CS_STALL |
254                                 PIPE_CONTROL_STALL_AT_SCOREBOARD);
255    for (uint32_t i = 0; i < count; i++) {
256       int s = q->index + i;
257       int g_idx = offset + offsetof(struct iris_query_so_overflow,
258                            stream[s].num_prims[end]);
259       int w_idx = offset + offsetof(struct iris_query_so_overflow,
260                            stream[s].prim_storage_needed[end]);
261       batch->screen->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
262                                      bo, g_idx, false);
263       batch->screen->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
264                                      bo, w_idx, false);
265    }
266 }
267 
268 static uint64_t
iris_raw_timestamp_delta(uint64_t time0,uint64_t time1)269 iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
270 {
271    if (time0 > time1) {
272       return (1ULL << TIMESTAMP_BITS) + time1 - time0;
273    } else {
274       return time1 - time0;
275    }
276 }
277 
278 static bool
stream_overflowed(struct iris_query_so_overflow * so,int s)279 stream_overflowed(struct iris_query_so_overflow *so, int s)
280 {
281    return (so->stream[s].prim_storage_needed[1] -
282            so->stream[s].prim_storage_needed[0]) !=
283           (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
284 }
285 
286 static void
calculate_result_on_cpu(const struct gen_device_info * devinfo,struct iris_query * q)287 calculate_result_on_cpu(const struct gen_device_info *devinfo,
288                         struct iris_query *q)
289 {
290    switch (q->type) {
291    case PIPE_QUERY_OCCLUSION_PREDICATE:
292    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
293       q->result = q->map->end != q->map->start;
294       break;
295    case PIPE_QUERY_TIMESTAMP:
296    case PIPE_QUERY_TIMESTAMP_DISJOINT:
297       /* The timestamp is the single starting snapshot. */
298       q->result = gen_device_info_timebase_scale(devinfo, q->map->start);
299       q->result &= (1ull << TIMESTAMP_BITS) - 1;
300       break;
301    case PIPE_QUERY_TIME_ELAPSED:
302       q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
303       q->result = gen_device_info_timebase_scale(devinfo, q->result);
304       q->result &= (1ull << TIMESTAMP_BITS) - 1;
305       break;
306    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
307       q->result = stream_overflowed((void *) q->map, q->index);
308       break;
309    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
310       q->result = false;
311       for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
312          q->result |= stream_overflowed((void *) q->map, i);
313       break;
314    case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
315       q->result = q->map->end - q->map->start;
316 
317       /* WaDividePSInvocationCountBy4:HSW,BDW */
318       if (GEN_GEN == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
319          q->result /= 4;
320       break;
321    case PIPE_QUERY_OCCLUSION_COUNTER:
322    case PIPE_QUERY_PRIMITIVES_GENERATED:
323    case PIPE_QUERY_PRIMITIVES_EMITTED:
324    default:
325       q->result = q->map->end - q->map->start;
326       break;
327    }
328 
329    q->ready = true;
330 }
331 
332 /**
333  * Calculate the streamout overflow for stream \p idx:
334  *
335  * (num_prims[1] - num_prims[0]) - (storage_needed[1] - storage_needed[0])
336  */
337 static struct gen_mi_value
calc_overflow_for_stream(struct gen_mi_builder * b,struct iris_query * q,int idx)338 calc_overflow_for_stream(struct gen_mi_builder *b,
339                          struct iris_query *q,
340                          int idx)
341 {
342 #define C(counter, i) query_mem64(q, \
343    offsetof(struct iris_query_so_overflow, stream[idx].counter[i]))
344 
345    return gen_mi_isub(b, gen_mi_isub(b, C(num_prims, 1), C(num_prims, 0)),
346                          gen_mi_isub(b, C(prim_storage_needed, 1),
347                                         C(prim_storage_needed, 0)));
348 #undef C
349 }
350 
351 /**
352  * Calculate whether any stream has overflowed.
353  */
354 static struct gen_mi_value
calc_overflow_any_stream(struct gen_mi_builder * b,struct iris_query * q)355 calc_overflow_any_stream(struct gen_mi_builder *b, struct iris_query *q)
356 {
357    struct gen_mi_value stream_result[MAX_VERTEX_STREAMS];
358    for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
359       stream_result[i] = calc_overflow_for_stream(b, q, i);
360 
361    struct gen_mi_value result = stream_result[0];
362    for (int i = 1; i < MAX_VERTEX_STREAMS; i++)
363       result = gen_mi_ior(b, result, stream_result[i]);
364 
365    return result;
366 }
367 
368 static bool
query_is_boolean(enum pipe_query_type type)369 query_is_boolean(enum pipe_query_type type)
370 {
371    switch (type) {
372    case PIPE_QUERY_OCCLUSION_PREDICATE:
373    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
374    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
375    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
376       return true;
377    default:
378       return false;
379    }
380 }
381 
382 /**
383  * Calculate the result using MI_MATH.
384  */
385 static struct gen_mi_value
calculate_result_on_gpu(const struct gen_device_info * devinfo,struct gen_mi_builder * b,struct iris_query * q)386 calculate_result_on_gpu(const struct gen_device_info *devinfo,
387                         struct gen_mi_builder *b,
388                         struct iris_query *q)
389 {
390    struct gen_mi_value result;
391    struct gen_mi_value start_val =
392       query_mem64(q, offsetof(struct iris_query_snapshots, start));
393    struct gen_mi_value end_val =
394       query_mem64(q, offsetof(struct iris_query_snapshots, end));
395 
396    switch (q->type) {
397    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
398       result = calc_overflow_for_stream(b, q, q->index);
399       break;
400    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
401       result = calc_overflow_any_stream(b, q);
402       break;
403    case PIPE_QUERY_TIMESTAMP: {
404       /* TODO: This discards any fractional bits of the timebase scale.
405        * We would need to do a bit of fixed point math on the CS ALU, or
406        * launch an actual shader to calculate this with full precision.
407        */
408       uint32_t scale = 1000000000ull / devinfo->timestamp_frequency;
409       result = gen_mi_iand(b, gen_mi_imm((1ull << 36) - 1),
410                            gen_mi_imul_imm(b, start_val, scale));
411       break;
412    }
413    case PIPE_QUERY_TIME_ELAPSED: {
414       /* TODO: This discards fractional bits (see above). */
415       uint32_t scale = 1000000000ull / devinfo->timestamp_frequency;
416       result = gen_mi_imul_imm(b, gen_mi_isub(b, end_val, start_val), scale);
417       break;
418    }
419    default:
420       result = gen_mi_isub(b, end_val, start_val);
421       break;
422    }
423 
424    /* WaDividePSInvocationCountBy4:HSW,BDW */
425    if (GEN_GEN == 8 &&
426        q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
427        q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
428       result = gen_mi_ushr32_imm(b, result, 2);
429 
430    if (query_is_boolean(q->type))
431       result = gen_mi_iand(b, gen_mi_nz(b, result), gen_mi_imm(1));
432 
433    return result;
434 }
435 
436 static struct pipe_query *
iris_create_query(struct pipe_context * ctx,unsigned query_type,unsigned index)437 iris_create_query(struct pipe_context *ctx,
438                   unsigned query_type,
439                   unsigned index)
440 {
441    struct iris_query *q = calloc(1, sizeof(struct iris_query));
442 
443    q->type = query_type;
444    q->index = index;
445    q->monitor = NULL;
446 
447    if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
448        q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
449       q->batch_idx = IRIS_BATCH_COMPUTE;
450    else
451       q->batch_idx = IRIS_BATCH_RENDER;
452    return (struct pipe_query *) q;
453 }
454 
455 static struct pipe_query *
iris_create_batch_query(struct pipe_context * ctx,unsigned num_queries,unsigned * query_types)456 iris_create_batch_query(struct pipe_context *ctx,
457                         unsigned num_queries,
458                         unsigned *query_types)
459 {
460    struct iris_context *ice = (void *) ctx;
461    struct iris_query *q = calloc(1, sizeof(struct iris_query));
462    if (unlikely(!q))
463       return NULL;
464    q->type = PIPE_QUERY_DRIVER_SPECIFIC;
465    q->index = -1;
466    q->monitor = iris_create_monitor_object(ice, num_queries, query_types);
467    if (unlikely(!q->monitor)) {
468       free(q);
469       return NULL;
470    }
471 
472    return (struct pipe_query *) q;
473 }
474 
475 static void
iris_destroy_query(struct pipe_context * ctx,struct pipe_query * p_query)476 iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
477 {
478    struct iris_query *query = (void *) p_query;
479    struct iris_screen *screen = (void *) ctx->screen;
480    if (query->monitor) {
481       iris_destroy_monitor_object(ctx, query->monitor);
482       query->monitor = NULL;
483    } else {
484       iris_syncobj_reference(screen, &query->syncobj, NULL);
485       screen->base.fence_reference(ctx->screen, &query->fence, NULL);
486    }
487    pipe_resource_reference(&query->query_state_ref.res, NULL);
488    free(query);
489 }
490 
491 
492 static bool
iris_begin_query(struct pipe_context * ctx,struct pipe_query * query)493 iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
494 {
495    struct iris_context *ice = (void *) ctx;
496    struct iris_query *q = (void *) query;
497 
498    if (q->monitor)
499       return iris_begin_monitor(ctx, q->monitor);
500 
501    void *ptr = NULL;
502    uint32_t size;
503 
504    if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
505        q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
506       size = sizeof(struct iris_query_so_overflow);
507    else
508       size = sizeof(struct iris_query_snapshots);
509 
510    u_upload_alloc(ice->query_buffer_uploader, 0,
511                   size, size, &q->query_state_ref.offset,
512                   &q->query_state_ref.res, &ptr);
513 
514    if (!iris_resource_bo(q->query_state_ref.res))
515       return false;
516 
517    q->map = ptr;
518    if (!q->map)
519       return false;
520 
521    q->result = 0ull;
522    q->ready = false;
523    WRITE_ONCE(q->map->snapshots_landed, false);
524 
525    if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
526       ice->state.prims_generated_query_active = true;
527       ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
528    }
529 
530    if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
531        q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
532       write_overflow_values(ice, q, false);
533    else
534       write_value(ice, q,
535                   q->query_state_ref.offset +
536                   offsetof(struct iris_query_snapshots, start));
537 
538    return true;
539 }
540 
541 static bool
iris_end_query(struct pipe_context * ctx,struct pipe_query * query)542 iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
543 {
544    struct iris_context *ice = (void *) ctx;
545    struct iris_query *q = (void *) query;
546 
547    if (q->monitor)
548       return iris_end_monitor(ctx, q->monitor);
549 
550    if (q->type == PIPE_QUERY_GPU_FINISHED) {
551       ctx->flush(ctx, &q->fence, PIPE_FLUSH_DEFERRED);
552       return true;
553    }
554 
555    struct iris_batch *batch = &ice->batches[q->batch_idx];
556 
557    if (q->type == PIPE_QUERY_TIMESTAMP) {
558       iris_begin_query(ctx, query);
559       iris_batch_reference_signal_syncobj(batch, &q->syncobj);
560       mark_available(ice, q);
561       return true;
562    }
563 
564    if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
565       ice->state.prims_generated_query_active = false;
566       ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
567    }
568 
569    if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
570        q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
571       write_overflow_values(ice, q, true);
572    else
573       write_value(ice, q,
574                   q->query_state_ref.offset +
575                   offsetof(struct iris_query_snapshots, end));
576 
577    iris_batch_reference_signal_syncobj(batch, &q->syncobj);
578    mark_available(ice, q);
579 
580    return true;
581 }
582 
583 /**
584  * See if the snapshots have landed for a query, and if so, compute the
585  * result and mark it ready.  Does not flush (unlike iris_get_query_result).
586  */
587 static void
iris_check_query_no_flush(struct iris_context * ice,struct iris_query * q)588 iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
589 {
590    struct iris_screen *screen = (void *) ice->ctx.screen;
591    const struct gen_device_info *devinfo = &screen->devinfo;
592 
593    if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
594       calculate_result_on_cpu(devinfo, q);
595    }
596 }
597 
598 static bool
iris_get_query_result(struct pipe_context * ctx,struct pipe_query * query,bool wait,union pipe_query_result * result)599 iris_get_query_result(struct pipe_context *ctx,
600                       struct pipe_query *query,
601                       bool wait,
602                       union pipe_query_result *result)
603 {
604    struct iris_context *ice = (void *) ctx;
605    struct iris_query *q = (void *) query;
606 
607    if (q->monitor)
608       return iris_get_monitor_result(ctx, q->monitor, wait, result->batch);
609 
610    struct iris_screen *screen = (void *) ctx->screen;
611    const struct gen_device_info *devinfo = &screen->devinfo;
612 
613    if (unlikely(screen->no_hw)) {
614       result->u64 = 0;
615       return true;
616    }
617 
618    if (q->type == PIPE_QUERY_GPU_FINISHED) {
619       struct pipe_screen *screen = ctx->screen;
620 
621       result->b = screen->fence_finish(screen, ctx, q->fence,
622                                        wait ? PIPE_TIMEOUT_INFINITE : 0);
623       return result->b;
624    }
625 
626    if (!q->ready) {
627       struct iris_batch *batch = &ice->batches[q->batch_idx];
628       if (q->syncobj == iris_batch_get_signal_syncobj(batch))
629          iris_batch_flush(batch);
630 
631       while (!READ_ONCE(q->map->snapshots_landed)) {
632          if (wait)
633             iris_wait_syncobj(ctx->screen, q->syncobj, INT64_MAX);
634          else
635             return false;
636       }
637 
638       assert(READ_ONCE(q->map->snapshots_landed));
639       calculate_result_on_cpu(devinfo, q);
640    }
641 
642    assert(q->ready);
643 
644    result->u64 = q->result;
645 
646    return true;
647 }
648 
649 static void
iris_get_query_result_resource(struct pipe_context * ctx,struct pipe_query * query,bool wait,enum pipe_query_value_type result_type,int index,struct pipe_resource * p_res,unsigned offset)650 iris_get_query_result_resource(struct pipe_context *ctx,
651                                struct pipe_query *query,
652                                bool wait,
653                                enum pipe_query_value_type result_type,
654                                int index,
655                                struct pipe_resource *p_res,
656                                unsigned offset)
657 {
658    struct iris_context *ice = (void *) ctx;
659    struct iris_query *q = (void *) query;
660    struct iris_batch *batch = &ice->batches[q->batch_idx];
661    const struct gen_device_info *devinfo = &batch->screen->devinfo;
662    struct iris_resource *res = (void *) p_res;
663    struct iris_bo *query_bo = iris_resource_bo(q->query_state_ref.res);
664    struct iris_bo *dst_bo = iris_resource_bo(p_res);
665    unsigned snapshots_landed_offset =
666       offsetof(struct iris_query_snapshots, snapshots_landed);
667 
668    res->bind_history |= PIPE_BIND_QUERY_BUFFER;
669 
670    if (index == -1) {
671       /* They're asking for the availability of the result.  If we still
672        * have commands queued up which produce the result, submit them
673        * now so that progress happens.  Either way, copy the snapshots
674        * landed field to the destination resource.
675        */
676       if (q->syncobj == iris_batch_get_signal_syncobj(batch))
677          iris_batch_flush(batch);
678 
679       batch->screen->vtbl.copy_mem_mem(batch, dst_bo, offset,
680                              query_bo, snapshots_landed_offset,
681                              result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
682       return;
683    }
684 
685    if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
686       /* The final snapshots happen to have landed, so let's just compute
687        * the result on the CPU now...
688        */
689       calculate_result_on_cpu(devinfo, q);
690    }
691 
692    if (q->ready) {
693       /* We happen to have the result on the CPU, so just copy it. */
694       if (result_type <= PIPE_QUERY_TYPE_U32) {
695          batch->screen->vtbl.store_data_imm32(batch, dst_bo, offset, q->result);
696       } else {
697          batch->screen->vtbl.store_data_imm64(batch, dst_bo, offset, q->result);
698       }
699 
700       /* Make sure the result lands before they use bind the QBO elsewhere
701        * and use the result.
702        */
703       // XXX: Why?  i965 doesn't do this.
704       iris_emit_pipe_control_flush(batch,
705                                    "query: unknown QBO flushing hack",
706                                    PIPE_CONTROL_CS_STALL);
707       return;
708    }
709 
710    bool predicated = !wait && !q->stalled;
711 
712    struct gen_mi_builder b;
713    gen_mi_builder_init(&b, batch);
714 
715    iris_batch_sync_region_start(batch);
716 
717    struct gen_mi_value result = calculate_result_on_gpu(devinfo, &b, q);
718    struct gen_mi_value dst =
719       result_type <= PIPE_QUERY_TYPE_U32 ?
720       gen_mi_mem32(rw_bo(dst_bo, offset, IRIS_DOMAIN_OTHER_WRITE)) :
721       gen_mi_mem64(rw_bo(dst_bo, offset, IRIS_DOMAIN_OTHER_WRITE));
722 
723    if (predicated) {
724       gen_mi_store(&b, gen_mi_reg32(MI_PREDICATE_RESULT),
725                    gen_mi_mem64(ro_bo(query_bo, snapshots_landed_offset)));
726       gen_mi_store_if(&b, dst, result);
727    } else {
728       gen_mi_store(&b, dst, result);
729    }
730 
731    iris_batch_sync_region_end(batch);
732 }
733 
734 static void
iris_set_active_query_state(struct pipe_context * ctx,bool enable)735 iris_set_active_query_state(struct pipe_context *ctx, bool enable)
736 {
737    struct iris_context *ice = (void *) ctx;
738 
739    if (ice->state.statistics_counters_enabled == enable)
740       return;
741 
742    // XXX: most packets aren't paying attention to this yet, because it'd
743    // have to be done dynamically at draw time, which is a pain
744    ice->state.statistics_counters_enabled = enable;
745    ice->state.dirty |= IRIS_DIRTY_CLIP |
746                        IRIS_DIRTY_RASTER |
747                        IRIS_DIRTY_STREAMOUT |
748                        IRIS_DIRTY_WM;
749    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_GS |
750                              IRIS_STAGE_DIRTY_TCS |
751                              IRIS_STAGE_DIRTY_TES |
752                              IRIS_STAGE_DIRTY_VS;
753 }
754 
755 static void
set_predicate_enable(struct iris_context * ice,bool value)756 set_predicate_enable(struct iris_context *ice, bool value)
757 {
758    if (value)
759       ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
760    else
761       ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
762 }
763 
764 static void
set_predicate_for_result(struct iris_context * ice,struct iris_query * q,bool inverted)765 set_predicate_for_result(struct iris_context *ice,
766                          struct iris_query *q,
767                          bool inverted)
768 {
769    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
770    struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
771 
772    iris_batch_sync_region_start(batch);
773 
774    /* The CPU doesn't have the query result yet; use hardware predication */
775    ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;
776 
777    /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
778    iris_emit_pipe_control_flush(batch,
779                                 "conditional rendering: set predicate",
780                                 PIPE_CONTROL_FLUSH_ENABLE);
781    q->stalled = true;
782 
783    struct gen_mi_builder b;
784    gen_mi_builder_init(&b, batch);
785 
786    struct gen_mi_value result;
787 
788    switch (q->type) {
789    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
790       result = calc_overflow_for_stream(&b, q, q->index);
791       break;
792    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
793       result = calc_overflow_any_stream(&b, q);
794       break;
795    default: {
796       /* PIPE_QUERY_OCCLUSION_* */
797       struct gen_mi_value start =
798          query_mem64(q, offsetof(struct iris_query_snapshots, start));
799       struct gen_mi_value end =
800          query_mem64(q, offsetof(struct iris_query_snapshots, end));
801       result = gen_mi_isub(&b, end, start);
802       break;
803    }
804    }
805 
806    result = inverted ? gen_mi_z(&b, result) : gen_mi_nz(&b, result);
807    result = gen_mi_iand(&b, result, gen_mi_imm(1));
808 
809    /* We immediately set the predicate on the render batch, as all the
810     * counters come from 3D operations.  However, we may need to predicate
811     * a compute dispatch, which executes in a different GEM context and has
812     * a different MI_PREDICATE_RESULT register.  So, we save the result to
813     * memory and reload it in iris_launch_grid.
814     */
815    gen_mi_value_ref(&b, result);
816    gen_mi_store(&b, gen_mi_reg32(MI_PREDICATE_RESULT), result);
817    gen_mi_store(&b, query_mem64(q, offsetof(struct iris_query_snapshots,
818                                             predicate_result)), result);
819    ice->state.compute_predicate = bo;
820 
821    iris_batch_sync_region_end(batch);
822 }
823 
824 static void
iris_render_condition(struct pipe_context * ctx,struct pipe_query * query,bool condition,enum pipe_render_cond_flag mode)825 iris_render_condition(struct pipe_context *ctx,
826                       struct pipe_query *query,
827                       bool condition,
828                       enum pipe_render_cond_flag mode)
829 {
830    struct iris_context *ice = (void *) ctx;
831    struct iris_query *q = (void *) query;
832 
833    /* The old condition isn't relevant; we'll update it if necessary */
834    ice->state.compute_predicate = NULL;
835    ice->condition.query = q;
836    ice->condition.condition = condition;
837 
838    if (!q) {
839       ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
840       return;
841    }
842 
843    iris_check_query_no_flush(ice, q);
844 
845    if (q->result || q->ready) {
846       set_predicate_enable(ice, (q->result != 0) ^ condition);
847    } else {
848       if (mode == PIPE_RENDER_COND_NO_WAIT ||
849           mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
850          perf_debug(&ice->dbg, "Conditional rendering demoted from "
851                     "\"no wait\" to \"wait\".");
852       }
853       set_predicate_for_result(ice, q, condition);
854    }
855 }
856 
857 static void
iris_resolve_conditional_render(struct iris_context * ice)858 iris_resolve_conditional_render(struct iris_context *ice)
859 {
860    struct pipe_context *ctx = (void *) ice;
861    struct iris_query *q = ice->condition.query;
862    struct pipe_query *query = (void *) q;
863    union pipe_query_result result;
864 
865    if (ice->state.predicate != IRIS_PREDICATE_STATE_USE_BIT)
866       return;
867 
868    assert(q);
869 
870    iris_get_query_result(ctx, query, true, &result);
871    set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition);
872 }
873 
874 void
genX(init_query)875 genX(init_query)(struct iris_context *ice)
876 {
877    struct pipe_context *ctx = &ice->ctx;
878    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
879 
880    ctx->create_query = iris_create_query;
881    ctx->create_batch_query = iris_create_batch_query;
882    ctx->destroy_query = iris_destroy_query;
883    ctx->begin_query = iris_begin_query;
884    ctx->end_query = iris_end_query;
885    ctx->get_query_result = iris_get_query_result;
886    ctx->get_query_result_resource = iris_get_query_result_resource;
887    ctx->set_active_query_state = iris_set_active_query_state;
888    ctx->render_condition = iris_render_condition;
889 
890    screen->vtbl.resolve_conditional_render = iris_resolve_conditional_render;
891 }
892