1 /*
2  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Rob Clark <robclark@freedesktop.org>
25  */
26 
27 #include "freedreno_query_hw.h"
28 #include "freedreno_context.h"
29 #include "freedreno_util.h"
30 
31 #include "fd4_query.h"
32 #include "fd4_context.h"
33 #include "fd4_draw.h"
34 #include "fd4_format.h"
35 
36 
/* Layout of the sample-counter block the RB dumps on a ZPASS_DONE event.
 * NOTE(review): only ctr[0] is consumed below (see count_samples()); the
 * semantics of the remaining slots are not established in this file.
 */
struct fd_rb_samp_ctrs {
	uint64_t ctr[16];
};
40 
41 /*
42  * Occlusion Query:
43  *
44  * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
45  * interpret results
46  */
47 
/* Emit the cmdstream to capture one occlusion sample: program the RB
 * sample-counter destination address, kick a degenerate draw, and write
 * a ZPASS_DONE event so the counters get dumped to the sample buffer.
 * Returns the fd_hw_sample that will hold the result.
 */
static struct fd_hw_sample *
occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
	/* reserve space for the full block of RB sample counters: */
	struct fd_hw_sample *samp =
			fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs));

	/* low bits of sample addr should be zero (since they are control
	 * flags in RB_SAMPLE_COUNT_CONTROL):
	 */
	debug_assert((samp->offset & 0x3) == 0);

	/* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
	 * HW_QUERY_BASE_REG register:
	 */
	OUT_PKT3(ring, CP_SET_CONSTANT, 3);
	OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000);
	OUT_RING(ring, HW_QUERY_BASE_REG);
	OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY |
			samp->offset);

	/* degenerate 0-index point draw — NOTE(review): presumably needed so
	 * the preceding register write takes effect in the draw pipeline
	 * before the event below; confirm against a4xx cmdstream docs:
	 */
	OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3);
	OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
						INDEX4_SIZE_32_BIT, USE_VISIBILITY));
	OUT_RING(ring, 1);             /* NumInstances */
	OUT_RING(ring, 0);             /* NumIndices */

	/* trigger the RB to write the sample counters out: */
	fd_event_write(batch, ring, ZPASS_DONE);

	return samp;
}
78 
79 static uint64_t
count_samples(const struct fd_rb_samp_ctrs * start,const struct fd_rb_samp_ctrs * end)80 count_samples(const struct fd_rb_samp_ctrs *start,
81 		const struct fd_rb_samp_ctrs *end)
82 {
83 	return end->ctr[0] - start->ctr[0];
84 }
85 
86 static void
occlusion_counter_accumulate_result(struct fd_context * ctx,const void * start,const void * end,union pipe_query_result * result)87 occlusion_counter_accumulate_result(struct fd_context *ctx,
88 		const void *start, const void *end,
89 		union pipe_query_result *result)
90 {
91 	uint64_t n = count_samples(start, end);
92 	result->u64 += n;
93 }
94 
95 static void
occlusion_predicate_accumulate_result(struct fd_context * ctx,const void * start,const void * end,union pipe_query_result * result)96 occlusion_predicate_accumulate_result(struct fd_context *ctx,
97 		const void *start, const void *end,
98 		union pipe_query_result *result)
99 {
100 	uint64_t n = count_samples(start, end);
101 	result->b |= (n > 0);
102 }
103 
104 /*
105  * Time Elapsed Query:
106  *
107  * Note: we could in theory support timestamp queries, but they
108  * won't give sensible results for tilers.
109  */
110 
/* Program the CP performance counter used for TIME_ELAPSED/TIMESTAMP
 * queries to count cycles unconditionally.
 */
static void
time_elapsed_enable(struct fd_context *ctx, struct fd_ringbuffer *ring)
{
	/* Right now, the assignment of countable to counter register is
	 * just hard coded.  If we start exposing more countables than we
	 * have counters, we will need to be more clever.
	 */
	struct fd_batch *batch = fd_context_batch(ctx);
	/* wait-for-idle before reprogramming the counter select: */
	fd_wfi(batch, ring);
	OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1);
	OUT_RING(ring, CP_ALWAYS_COUNT);
}
123 
/* Capture a 64-bit cycle-counter sample into a per-tile query slot.
 * See the long comment below for why this takes five CP packets: there
 * is no single packet that copies a register to a *relative* (per-tile
 * offset) destination, so the address math is done in a scratch buffer.
 */
static struct fd_hw_sample *
time_elapsed_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
	struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t));

	/* use unused part of vsc_size_mem as scratch space, to avoid
	 * extra allocation:
	 */
	struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem;
	const int sample_off = 128;
	const int addr_off = sample_off + 8;

	/* result conversion divides by max_freq, so it must be known: */
	debug_assert(batch->ctx->screen->max_freq > 0);

	/* Basic issue is that we need to read counter value to a relative
	 * destination (with per-tile offset) rather than absolute dest
	 * addr.  But there is no pm4 packet that can do that.  This is
	 * where it would be *really* nice if we could write our own fw
	 * since afaict implementing the sort of packet we need would be
	 * trivial.
	 *
	 * Instead, we:
	 * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer
	 * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer
	 * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base
	 *     address to the per-sample offset in the scratch buffer
	 * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3
	 *     to CP_ME_NRT_ADDR
	 * (5) CP_MEM_TO_REG's to copy saved counter value from scratch
	 *     buffer to CP_ME_NRT_DATA to trigger the write out to query
	 *     result buffer
	 *
	 * Straightforward, right?
	 *
	 * Maybe could swap the order of things in the scratch buffer to
	 * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one
	 * shot, but that's really just polishing a turd..
	 */

	fd_wfi(batch, ring);

	/* copy sample counter _LO and _HI to scratch: */
	OUT_PKT3(ring, CP_REG_TO_MEM, 2);
	OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) |
			CP_REG_TO_MEM_0_64B |
			CP_REG_TO_MEM_0_CNT(2)); /* write 2 regs to mem */
	OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

	/* ok... here we really *would* like to use the CP_SET_CONSTANT
	 * mode which can add a constant to value in reg2 and write to
	 * reg1... *but* that only works for banked/context registers,
	 * and CP_ME_NRT_DATA isn't one of those.. so we need to do some
	 * CP math to the scratch buffer instead:
	 *
	 * (note first 8 bytes are counter value, use offset 0x8 for
	 * address calculation)
	 */

	/* per-sample offset to scratch bo: */
	OUT_PKT3(ring, CP_MEM_WRITE, 2);
	OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
	OUT_RING(ring, samp->offset);

	/* now add to that the per-tile base: */
	OUT_PKT3(ring, CP_REG_TO_MEM, 2);
	OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) |
			CP_REG_TO_MEM_0_ACCUMULATE |
			CP_REG_TO_MEM_0_CNT(0));       /* readback 1 regs */
	OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

	/* now copy that back to CP_ME_NRT_ADDR: */
	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
	OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR);
	OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

	/* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA
	 * to trigger the write to result buffer
	 */
	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
	OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
	OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

	/* and again to get the value of the _HI reg from scratch: */
	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
	OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
	OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0);

	/* Sigh.. */

	return samp;
}
215 
216 static void
time_elapsed_accumulate_result(struct fd_context * ctx,const void * start,const void * end,union pipe_query_result * result)217 time_elapsed_accumulate_result(struct fd_context *ctx,
218 		const void *start, const void *end,
219 		union pipe_query_result *result)
220 {
221 	uint64_t n = *(uint64_t *)end - *(uint64_t *)start;
222 	/* max_freq is in Hz, convert cycle count to ns: */
223 	result->u64 += n * 1000000000 / ctx->screen->max_freq;
224 }
225 
226 static void
timestamp_accumulate_result(struct fd_context * ctx,const void * start,const void * end,union pipe_query_result * result)227 timestamp_accumulate_result(struct fd_context *ctx,
228 		const void *start, const void *end,
229 		union pipe_query_result *result)
230 {
231 	/* just return the value from fist tile: */
232 	if (result->u64 != 0)
233 		return;
234 	uint64_t n = *(uint64_t *)start;
235 	/* max_freq is in Hz, convert cycle count to ns: */
236 	result->u64 = n * 1000000000 / ctx->screen->max_freq;
237 }
238 
/* Provider: occlusion counter — accumulates the raw passed-sample count. */
static const struct fd_hw_sample_provider occlusion_counter = {
		.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
		.active = FD_STAGE_DRAW,
		.get_sample = occlusion_get_sample,
		.accumulate_result = occlusion_counter_accumulate_result,
};
245 
/* Provider: occlusion predicate — same sample as the counter, but the
 * result is reduced to a boolean "any samples passed".
 */
static const struct fd_hw_sample_provider occlusion_predicate = {
		.query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
		.active = FD_STAGE_DRAW,
		.get_sample = occlusion_get_sample,
		.accumulate_result = occlusion_predicate_accumulate_result,
};
252 
/* Provider: conservative occlusion predicate — implemented with the
 * exact (non-conservative) predicate path, which is a valid conservative
 * answer.
 */
static const struct fd_hw_sample_provider occlusion_predicate_conservative = {
		.query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
		.active = FD_STAGE_DRAW,
		.get_sample = occlusion_get_sample,
		.accumulate_result = occlusion_predicate_accumulate_result,
};
259 
/* Provider: time-elapsed — active across all stages, samples the CP
 * cycle counter programmed by time_elapsed_enable().
 */
static const struct fd_hw_sample_provider time_elapsed = {
		.query_type = PIPE_QUERY_TIME_ELAPSED,
		.active = FD_STAGE_ALL,
		.enable = time_elapsed_enable,
		.get_sample = time_elapsed_get_sample,
		.accumulate_result = time_elapsed_accumulate_result,
};
267 
/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more non-sensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 */
static const struct fd_hw_sample_provider timestamp = {
		.query_type = PIPE_QUERY_TIMESTAMP,
		.active = FD_STAGE_ALL,
		/* shares the cycle-counter setup/sampling with time_elapsed: */
		.enable = time_elapsed_enable,
		.get_sample = time_elapsed_get_sample,
		.accumulate_result = timestamp_accumulate_result,
};
281 
fd4_query_context_init(struct pipe_context * pctx)282 void fd4_query_context_init(struct pipe_context *pctx)
283 {
284 	struct fd_context *ctx = fd_context(pctx);
285 
286 	ctx->create_query = fd_hw_create_query;
287 	ctx->query_prepare = fd_hw_query_prepare;
288 	ctx->query_prepare_tile = fd_hw_query_prepare_tile;
289 	ctx->query_set_stage = fd_hw_query_set_stage;
290 
291 	fd_hw_query_register_provider(pctx, &occlusion_counter);
292 	fd_hw_query_register_provider(pctx, &occlusion_predicate);
293 	fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative);
294 	fd_hw_query_register_provider(pctx, &time_elapsed);
295 	fd_hw_query_register_provider(pctx, &timestamp);
296 }
297