1 /*
2 * Copyright © 2020 Google, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #ifndef _U_TRACE_H
25 #define _U_TRACE_H
26
27 #include <stdbool.h>
28 #include <stdint.h>
29 #include <stdio.h>
30
31 #include "util/macros.h"
32 #include "util/u_atomic.h"
33 #include "util/u_queue.h"
34
35 #ifdef __cplusplus
36 extern "C" {
37 #endif
38
39 /* A trace mechanism (very) loosely inspired by the linux kernel tracepoint
40 * mechanism, in that it allows for defining driver specific (or common)
41 * tracepoints, which generate 'trace_$name()' functions that can be
42 * called at various points in commandstream emit.
43 *
44 * Currently a printf backend is implemented, but the expectation is to
45 * also implement a perfetto backend for shipping out traces to a tool like
46 * AGI.
47 *
48 * Notable differences:
49 *
50 * - GPU timestamps! A driver provided callback is used to emit timestamps
51 * to a buffer. At a later point in time (when stalling to wait for the
52 * GPU is not required), the timestamps are re-united with the trace
53 * payload. This makes the trace mechanism suitable for profiling.
54 *
55 * - Instead of a systemwide trace ringbuffer, buffering of un-retired
56 * tracepoints is split into two stages. Traces are emitted to a
57 * 'u_trace' instance, and at a later time flushed to a 'u_trace_context'
58 * instance. This avoids the requirement that commandstream containing
59 * tracepoints is emitted in the same order as it is generated.
60 *
61 * If the hw has multiple parallel "engines" (for example, 3d/blit/compute)
62 * then a `u_trace_context` per-engine should be used.
63 *
64 * - Unlike kernel tracepoints, u_trace tracepoints are defined in py
65 * from which header and src files are generated. Since we already have
66 * a build dependency on python+mako, this gives more flexibility than
67 * clunky preprocessor macro magic.
68 *
69 */
70
71 struct u_trace_context;
72 struct u_trace;
73 struct u_trace_chunk;
74 struct u_trace_printer;
75
76 /**
77 * Special reserved value to indicate that no timestamp was captured,
78 * and that the timestamp of the previous trace should be reused.
79 */
80 #define U_TRACE_NO_TIMESTAMP ((uint64_t) 0)
81
82 /**
83 * Driver provided callback to create a timestamp buffer which will be
 * read by the u_trace_read_ts function.
85 */
86 typedef void *(*u_trace_create_ts_buffer)(struct u_trace_context *utctx,
87 uint32_t timestamps_count);
88
89 /**
90 * Driver provided callback to delete a timestamp buffer.
91 */
92 typedef void (*u_trace_delete_ts_buffer)(struct u_trace_context *utctx,
93 void *timestamps);
94
95 /**
 * Driver provided callback to emit commands into the specified command
97 * stream to capture a 64b timestamp into the specified timestamps buffer,
98 * at the specified index.
99 *
100 * The hw counter that the driver records should be something that runs at
101 * a fixed rate, even as the GPU freq changes. The same source used for
102 * GL_TIMESTAMP queries should be appropriate.
103 */
104 typedef void (*u_trace_record_ts)(struct u_trace *ut,
105 void *cs,
106 void *timestamps,
107 unsigned idx,
108 bool end_of_pipe);
109
110 /**
111 * Driver provided callback to read back a previously recorded timestamp.
112 * If necessary, this should block until the GPU has finished writing back
113 * the timestamps. (The timestamps will be read back in order, so it is
114 * safe to only synchronize on idx==0.)
115 *
116 * flush_data is data provided by the driver via u_trace_flush.
117 *
118 * The returned timestamp should be in units of nanoseconds. The same
119 * timebase as GL_TIMESTAMP queries should be used.
120 *
121 * The driver can return the special U_TRACE_NO_TIMESTAMP value to indicate
122 * that no timestamp was captured and the timestamp from the previous trace
123 * will be re-used. (The first trace in the u_trace buf may not do this.)
124 * This allows the driver to detect cases where multiple tracepoints are
125 * emitted with no other intervening cmdstream, to avoid pointlessly
126 * capturing the same timestamp multiple times in a row.
127 */
128 typedef uint64_t (*u_trace_read_ts)(struct u_trace_context *utctx,
129 void *timestamps,
130 unsigned idx,
131 void *flush_data);
132
133 /**
134 * Driver provided callback to delete flush data.
135 */
136 typedef void (*u_trace_delete_flush_data)(struct u_trace_context *utctx,
137 void *flush_data);
138
139 enum u_trace_type {
140 U_TRACE_TYPE_PRINT = 1u << 0,
141 U_TRACE_TYPE_JSON = 1u << 1,
142 U_TRACE_TYPE_PERFETTO_ACTIVE = 1u << 2,
143 U_TRACE_TYPE_PERFETTO_ENV = 1u << 3,
144 U_TRACE_TYPE_MARKERS = 1u << 4,
145
146 U_TRACE_TYPE_PRINT_JSON = U_TRACE_TYPE_PRINT | U_TRACE_TYPE_JSON,
147 U_TRACE_TYPE_PERFETTO =
148 U_TRACE_TYPE_PERFETTO_ACTIVE | U_TRACE_TYPE_PERFETTO_ENV,
149
150 /*
151 * A mask of traces that require appending to the tracepoint chunk list.
152 */
153 U_TRACE_TYPE_REQUIRE_QUEUING = U_TRACE_TYPE_PRINT | U_TRACE_TYPE_PERFETTO,
154 /*
155 * A mask of traces that require processing the tracepoint chunk list.
156 */
157 U_TRACE_TYPE_REQUIRE_PROCESSING =
158 U_TRACE_TYPE_PRINT | U_TRACE_TYPE_PERFETTO_ACTIVE,
159 };
160
161 /**
162 * The trace context provides tracking for "in-flight" traces, once the
163 * cmdstream that records timestamps has been flushed.
164 */
165 struct u_trace_context {
166 /* All traces enabled in this context */
167 enum u_trace_type enabled_traces;
168
169 void *pctx;
170
171 u_trace_create_ts_buffer create_timestamp_buffer;
172 u_trace_delete_ts_buffer delete_timestamp_buffer;
173 u_trace_record_ts record_timestamp;
174 u_trace_read_ts read_timestamp;
175 u_trace_delete_flush_data delete_flush_data;
176
177 FILE *out;
178 struct u_trace_printer *out_printer;
179
180 /* Once u_trace_flush() is called u_trace_chunk's are queued up to
181 * render tracepoints on a queue. The per-chunk queue jobs block until
182 * timestamps are available.
183 */
184 struct util_queue queue;
185
186 #ifdef HAVE_PERFETTO
187 /* node in global list of trace contexts. */
188 struct list_head node;
189 #endif
190
191 /* State to accumulate time across N chunks associated with a single
192 * batch (u_trace).
193 */
194 uint64_t last_time_ns;
195 uint64_t first_time_ns;
196
197 uint32_t frame_nr;
198 uint32_t batch_nr;
199 uint32_t event_nr;
200 bool start_of_frame;
201
202 /* list of unprocessed trace chunks in fifo order: */
203 struct list_head flushed_trace_chunks;
204 };
205
206 /**
207 * The u_trace ptr is passed as the first arg to generated tracepoints.
208 * It provides buffering for tracepoint payload until the corresponding
209 * driver cmdstream containing the emitted commands to capture is
210 * flushed.
211 *
212 * Individual tracepoints emitted to u_trace are expected to be "executed"
213 * (ie. timestamp captured) in FIFO order with respect to other tracepoints
214 * emitted to the same u_trace. But the order WRT other u_trace instances
 * is undefined until u_trace_flush().
216 */
217 struct u_trace {
218 struct u_trace_context *utctx;
219
220 struct list_head
221 trace_chunks; /* list of unflushed trace chunks in fifo order */
222 };
223
224 void u_trace_context_init(struct u_trace_context *utctx,
225 void *pctx,
226 u_trace_create_ts_buffer create_timestamp_buffer,
227 u_trace_delete_ts_buffer delete_timestamp_buffer,
228 u_trace_record_ts record_timestamp,
229 u_trace_read_ts read_timestamp,
230 u_trace_delete_flush_data delete_flush_data);
231 void u_trace_context_fini(struct u_trace_context *utctx);
232
233 /**
234 * Flush (trigger processing) of traces previously flushed to the
235 * trace-context by u_trace_flush().
236 *
237 * This should typically be called in the driver's pctx->flush().
238 */
239 void u_trace_context_process(struct u_trace_context *utctx, bool eof);
240
241 void u_trace_init(struct u_trace *ut, struct u_trace_context *utctx);
242 void u_trace_fini(struct u_trace *ut);
243
244 void u_trace_state_init(void);
245 bool u_trace_is_enabled(enum u_trace_type type);
246
247 bool u_trace_has_points(struct u_trace *ut);
248
249 struct u_trace_iterator {
250 struct u_trace *ut;
251 struct u_trace_chunk *chunk;
252 uint32_t event_idx;
253 };
254
255 struct u_trace_iterator u_trace_begin_iterator(struct u_trace *ut);
256
257 struct u_trace_iterator u_trace_end_iterator(struct u_trace *ut);
258
259 bool u_trace_iterator_equal(struct u_trace_iterator a,
260 struct u_trace_iterator b);
261
262 typedef void (*u_trace_copy_ts_buffer)(struct u_trace_context *utctx,
263 void *cmdstream,
264 void *ts_from,
265 uint32_t from_offset,
266 void *ts_to,
267 uint32_t to_offset,
268 uint32_t count);
269
270 /**
271 * Clones tracepoints range into target u_trace.
272 * Provides callback for driver to copy timestamps on GPU from
273 * one buffer to another.
274 *
275 * It allows:
276 * - Tracing re-usable command buffer in Vulkan, by copying tracepoints
277 * each time it is submitted.
278 * - Per-tile tracing for tiling GPUs, by copying a range of tracepoints
279 * corresponding to a tile.
280 */
281 void u_trace_clone_append(struct u_trace_iterator begin_it,
282 struct u_trace_iterator end_it,
283 struct u_trace *into,
284 void *cmdstream,
285 u_trace_copy_ts_buffer copy_ts_buffer);
286
287 void u_trace_disable_event_range(struct u_trace_iterator begin_it,
288 struct u_trace_iterator end_it);
289
290 /**
291 * Flush traces to the parent trace-context. At this point, the expectation
292 * is that all the tracepoints are "executed" by the GPU following any
293 * previously flushed u_trace batch.
294 *
295 * flush_data is a way for driver to pass additional data, which becomes
296 * available only at the point of flush, to the u_trace_read_ts callback and
297 * perfetto. The typical example of such data would be a fence to wait on in
298 * u_trace_read_ts, and a submission_id to pass into perfetto. The destruction
299 * of the data is done via u_trace_delete_flush_data.
300 *
301 * This should typically be called when the corresponding cmdstream
302 * (containing the timestamp reads) is flushed to the kernel.
303 */
304 void u_trace_flush(struct u_trace *ut, void *flush_data, bool free_data);
305
306 #ifdef HAVE_PERFETTO
307 static ALWAYS_INLINE bool
u_trace_perfetto_active(struct u_trace_context * utctx)308 u_trace_perfetto_active(struct u_trace_context *utctx)
309 {
310 return p_atomic_read_relaxed(&utctx->enabled_traces) &
311 U_TRACE_TYPE_PERFETTO_ACTIVE;
312 }
313
314 void u_trace_perfetto_start(void);
315 void u_trace_perfetto_stop(void);
316 #else
317 static ALWAYS_INLINE bool
u_trace_perfetto_active(UNUSED struct u_trace_context * utctx)318 u_trace_perfetto_active(UNUSED struct u_trace_context *utctx)
319 {
320 return false;
321 }
322 #endif
323
324 /**
325 * Return whether utrace is enabled at all or not, this can be used to
326 * gate any expensive traces.
327 */
328 static ALWAYS_INLINE bool
u_trace_enabled(struct u_trace_context * utctx)329 u_trace_enabled(struct u_trace_context *utctx)
330 {
331 return p_atomic_read_relaxed(&utctx->enabled_traces) != 0;
332 }
333
334 /**
335 * Return whether chunks should be processed or not.
336 */
337 static ALWAYS_INLINE bool
u_trace_should_process(struct u_trace_context * utctx)338 u_trace_should_process(struct u_trace_context *utctx)
339 {
340 return p_atomic_read_relaxed(&utctx->enabled_traces) &
341 U_TRACE_TYPE_REQUIRE_PROCESSING;
342 }
343
344 /**
345 * Return whether to emit markers into the command stream even if the queue
346 * isn't active.
347 */
348 static ALWAYS_INLINE bool
u_trace_markers_enabled(struct u_trace_context * utctx)349 u_trace_markers_enabled(struct u_trace_context *utctx)
350 {
351 return p_atomic_read_relaxed(&utctx->enabled_traces) &
352 U_TRACE_TYPE_MARKERS;
353 }
354
355 #ifdef __cplusplus
356 }
357 #endif
358
359 #endif /* _U_TRACE_H */
360