/*
 * Copyright © 2020 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#ifndef _U_TRACE_H
#define _U_TRACE_H

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#include "util/macros.h"
#include "util/u_atomic.h"
#include "util/u_queue.h"

#ifdef __cplusplus
extern "C" {
#endif

/* A trace mechanism (very) loosely inspired by the linux kernel tracepoint
 * mechanism, in that it allows for defining driver specific (or common)
 * tracepoints, which generate 'trace_$name()' functions that can be
 * called at various points in commandstream emit.
 *
 * Currently a printf backend is implemented, but the expectation is to
 * also implement a perfetto backend for shipping out traces to a tool like
 * AGI.
 *
 * Notable differences:
 *
 *  - GPU timestamps!  A driver provided callback is used to emit timestamps
 *    to a buffer.  At a later point in time (when stalling to wait for the
 *    GPU is not required), the timestamps are re-united with the trace
 *    payload.  This makes the trace mechanism suitable for profiling.
 *
 *  - Instead of a systemwide trace ringbuffer, buffering of un-retired
 *    tracepoints is split into two stages.  Traces are emitted to a
 *    'u_trace' instance, and at a later time flushed to a 'u_trace_context'
 *    instance.  This avoids the requirement that commandstream containing
 *    tracepoints is emitted in the same order as it is generated.
 *
 *    If the hw has multiple parallel "engines" (for example, 3d/blit/compute)
 *    then a `u_trace_context` per-engine should be used.
 *
 *  - Unlike kernel tracepoints, u_trace tracepoints are defined in py
 *    from which header and src files are generated.  Since we already have
 *    a build dependency on python+mako, this gives more flexibility than
 *    clunky preprocessor macro magic.
 *
 */
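
/* Example: a rough sketch of the intended flow, for orientation only.  The
 * trace_start_render_pass()/trace_end_render_pass() tracepoints and the
 * driver_*() callbacks are hypothetical names, standing in for whatever a
 * driver generates from its tracepoint definitions and provides as
 * callbacks:
 *
 *    // once, at context creation:
 *    u_trace_context_init(&ctx->trace_context, ctx,
 *                         driver_create_ts_buffer,
 *                         driver_delete_ts_buffer,
 *                         driver_record_ts,
 *                         driver_read_ts,
 *                         driver_delete_flush_data);
 *
 *    // per batch / cmdstream:
 *    u_trace_init(&batch->trace, &ctx->trace_context);
 *    trace_start_render_pass(&batch->trace, ...);
 *    ...
 *    trace_end_render_pass(&batch->trace, ...);
 *
 *    // when the cmdstream is submitted to the kernel:
 *    u_trace_flush(&batch->trace, NULL, false);
 *
 *    // and in the driver's flush path:
 *    u_trace_context_process(&ctx->trace_context, end_of_frame);
 */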

struct u_trace_context;
struct u_trace;
struct u_trace_chunk;
struct u_trace_printer;

/**
 * Special reserved value to indicate that no timestamp was captured,
 * and that the timestamp of the previous trace should be reused.
 */
#define U_TRACE_NO_TIMESTAMP ((uint64_t) 0)

/**
 * Driver provided callback to create a timestamp buffer which will be
 * read by the u_trace_read_ts callback.
 */
typedef void *(*u_trace_create_ts_buffer)(struct u_trace_context *utctx,
                                          uint32_t timestamps_count);

/**
 * Driver provided callback to delete a timestamp buffer.
 */
typedef void (*u_trace_delete_ts_buffer)(struct u_trace_context *utctx,
                                         void *timestamps);

/**
 * Driver provided callback to emit commands into the specified command
 * stream to capture a 64b timestamp into the specified timestamps buffer,
 * at the specified index.
 *
 * The hw counter that the driver records should be something that runs at
 * a fixed rate, even as the GPU freq changes.  The same source used for
 * GL_TIMESTAMP queries should be appropriate.
 */
typedef void (*u_trace_record_ts)(struct u_trace *ut,
                                  void *cs,
                                  void *timestamps,
                                  unsigned idx,
                                  bool end_of_pipe);
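
/* A minimal sketch of a driver-side implementation, assuming a hypothetical
 * emit_ts_write_packet() helper and 'struct driver_bo' timestamp buffer; the
 * real code would emit whatever packet the hw uses to write a timestamp:
 *
 *    static void
 *    driver_record_ts(struct u_trace *ut, void *cs, void *timestamps,
 *                     unsigned idx, bool end_of_pipe)
 *    {
 *       struct driver_bo *bo = timestamps;
 *
 *       // ask the GPU to write its counter into the idx'th 64b slot,
 *       // waiting for prior work to drain when end_of_pipe is set:
 *       emit_ts_write_packet(cs, bo, idx * sizeof(uint64_t), end_of_pipe);
 *    }
 */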

/**
 * Driver provided callback to read back a previously recorded timestamp.
 * If necessary, this should block until the GPU has finished writing back
 * the timestamps.  (The timestamps will be read back in order, so it is
 * safe to only synchronize on idx==0.)
 *
 * flush_data is data provided by the driver via u_trace_flush.
 *
 * The returned timestamp should be in units of nanoseconds.  The same
 * timebase as GL_TIMESTAMP queries should be used.
 *
 * The driver can return the special U_TRACE_NO_TIMESTAMP value to indicate
 * that no timestamp was captured and the timestamp from the previous trace
 * will be re-used.  (The first trace in the u_trace buf may not do this.)
 * This allows the driver to detect cases where multiple tracepoints are
 * emitted with no other intervening cmdstream, to avoid pointlessly
 * capturing the same timestamp multiple times in a row.
 */
typedef uint64_t (*u_trace_read_ts)(struct u_trace_context *utctx,
                                    void *timestamps,
                                    unsigned idx,
                                    void *flush_data);
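
/* A minimal read-back sketch, again with hypothetical driver helpers
 * (driver_fence_wait(), driver_bo_map(), driver_ticks_to_ns()) and assuming
 * flush_data carries the fence handed to u_trace_flush():
 *
 *    static uint64_t
 *    driver_read_ts(struct u_trace_context *utctx, void *timestamps,
 *                   unsigned idx, void *flush_data)
 *    {
 *       struct driver_bo *bo = timestamps;
 *       struct driver_submit *submit = flush_data;
 *
 *       // timestamps are read back in order, so only idx==0 needs to wait:
 *       if (idx == 0)
 *          driver_fence_wait(submit->fence);
 *
 *       uint64_t ts = ((uint64_t *) driver_bo_map(bo))[idx];
 *
 *       // a slot the GPU never wrote (still zero from buffer creation)
 *       // becomes U_TRACE_NO_TIMESTAMP, re-using the previous timestamp:
 *       if (ts == 0)
 *          return U_TRACE_NO_TIMESTAMP;
 *
 *       return driver_ticks_to_ns(ts);
 *    }
 */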

/**
 * Driver provided callback to delete flush data.
 */
typedef void (*u_trace_delete_flush_data)(struct u_trace_context *utctx,
                                          void *flush_data);

enum u_trace_type {
   U_TRACE_TYPE_PRINT = 1u << 0,
   U_TRACE_TYPE_JSON = 1u << 1,
   U_TRACE_TYPE_PERFETTO_ACTIVE = 1u << 2,
   U_TRACE_TYPE_PERFETTO_ENV = 1u << 3,
   U_TRACE_TYPE_MARKERS = 1u << 4,

   U_TRACE_TYPE_PRINT_JSON = U_TRACE_TYPE_PRINT | U_TRACE_TYPE_JSON,
   U_TRACE_TYPE_PERFETTO =
      U_TRACE_TYPE_PERFETTO_ACTIVE | U_TRACE_TYPE_PERFETTO_ENV,

   /*
    * A mask of traces that require appending to the tracepoint chunk list.
    */
   U_TRACE_TYPE_REQUIRE_QUEUING = U_TRACE_TYPE_PRINT | U_TRACE_TYPE_PERFETTO,
   /*
    * A mask of traces that require processing the tracepoint chunk list.
    */
   U_TRACE_TYPE_REQUIRE_PROCESSING =
      U_TRACE_TYPE_PRINT | U_TRACE_TYPE_PERFETTO_ACTIVE,
};

/**
 * The trace context provides tracking for "in-flight" traces, once the
 * cmdstream that records timestamps has been flushed.
 */
struct u_trace_context {
   /* All traces enabled in this context */
   enum u_trace_type enabled_traces;

   void *pctx;

   u_trace_create_ts_buffer create_timestamp_buffer;
   u_trace_delete_ts_buffer delete_timestamp_buffer;
   u_trace_record_ts record_timestamp;
   u_trace_read_ts read_timestamp;
   u_trace_delete_flush_data delete_flush_data;

   FILE *out;
   struct u_trace_printer *out_printer;

   /* Once u_trace_flush() is called u_trace_chunk's are queued up to
    * render tracepoints on a queue.  The per-chunk queue jobs block until
    * timestamps are available.
    */
   struct util_queue queue;

#ifdef HAVE_PERFETTO
   /* node in global list of trace contexts. */
   struct list_head node;
#endif

   /* State to accumulate time across N chunks associated with a single
    * batch (u_trace).
    */
   uint64_t last_time_ns;
   uint64_t first_time_ns;

   uint32_t frame_nr;
   uint32_t batch_nr;
   uint32_t event_nr;
   bool start_of_frame;

   /* list of unprocessed trace chunks in fifo order: */
   struct list_head flushed_trace_chunks;
};

/**
 * The u_trace ptr is passed as the first arg to generated tracepoints.
 * It provides buffering for tracepoint payload until the corresponding
 * driver cmdstream containing the emitted commands to capture is
 * flushed.
 *
 * Individual tracepoints emitted to u_trace are expected to be "executed"
 * (ie. timestamp captured) in FIFO order with respect to other tracepoints
 * emitted to the same u_trace.  But the order WRT other u_trace instances
 * is undefined until u_trace_flush().
 */
struct u_trace {
   struct u_trace_context *utctx;

   struct list_head
      trace_chunks; /* list of unflushed trace chunks in fifo order */
};

void u_trace_context_init(struct u_trace_context *utctx,
                          void *pctx,
                          u_trace_create_ts_buffer create_timestamp_buffer,
                          u_trace_delete_ts_buffer delete_timestamp_buffer,
                          u_trace_record_ts record_timestamp,
                          u_trace_read_ts read_timestamp,
                          u_trace_delete_flush_data delete_flush_data);
void u_trace_context_fini(struct u_trace_context *utctx);

/**
 * Flush (trigger processing) of traces previously flushed to the
 * trace-context by u_trace_flush().
 *
 * This should typically be called in the driver's pctx->flush().
 */
void u_trace_context_process(struct u_trace_context *utctx, bool eof);

void u_trace_init(struct u_trace *ut, struct u_trace_context *utctx);
void u_trace_fini(struct u_trace *ut);

void u_trace_state_init(void);
bool u_trace_is_enabled(enum u_trace_type type);

bool u_trace_has_points(struct u_trace *ut);

struct u_trace_iterator {
   struct u_trace *ut;
   struct u_trace_chunk *chunk;
   uint32_t event_idx;
};

struct u_trace_iterator u_trace_begin_iterator(struct u_trace *ut);

struct u_trace_iterator u_trace_end_iterator(struct u_trace *ut);

bool u_trace_iterator_equal(struct u_trace_iterator a,
                            struct u_trace_iterator b);

typedef void (*u_trace_copy_ts_buffer)(struct u_trace_context *utctx,
                                       void *cmdstream,
                                       void *ts_from,
                                       uint32_t from_offset,
                                       void *ts_to,
                                       uint32_t to_offset,
                                       uint32_t count);

/**
 * Clones a range of tracepoints into the target u_trace.
 * The driver provides a callback to copy timestamps on the GPU from
 * one buffer to another.
 *
 * This allows:
 * - Tracing a re-usable command buffer in Vulkan, by copying its tracepoints
 *   each time it is submitted (see the sketch below the declaration).
 * - Per-tile tracing for tiling GPUs, by copying a range of tracepoints
 *   corresponding to a tile.
 */
void u_trace_clone_append(struct u_trace_iterator begin_it,
                          struct u_trace_iterator end_it,
                          struct u_trace *into,
                          void *cmdstream,
                          u_trace_copy_ts_buffer copy_ts_buffer);
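
/* For example, a Vulkan driver re-submitting a re-usable command buffer
 * might do something along these lines (copy_ts_on_gpu() and the
 * cmdbuf/submit fields are hypothetical):
 *
 *    struct u_trace_iterator begin = u_trace_begin_iterator(&cmdbuf->trace);
 *    struct u_trace_iterator end = u_trace_end_iterator(&cmdbuf->trace);
 *
 *    u_trace_clone_append(begin, end, &submit->trace, submit->cs,
 *                         copy_ts_on_gpu);
 *
 * where copy_ts_on_gpu() emits commands into the given cmdstream to copy
 * 'count' timestamps from ts_from at from_offset to ts_to at to_offset.
 */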

void u_trace_disable_event_range(struct u_trace_iterator begin_it,
                                 struct u_trace_iterator end_it);

/**
 * Flush traces to the parent trace-context.  At this point, the expectation
 * is that all the tracepoints are "executed" by the GPU following any
 * previously flushed u_trace batch.
 *
 * flush_data is a way for the driver to pass additional data, which becomes
 * available only at the point of flush, to the u_trace_read_ts callback and
 * perfetto. The typical example of such data would be a fence to wait on in
 * u_trace_read_ts, and a submission_id to pass into perfetto. The destruction
 * of the data is done via u_trace_delete_flush_data.
 *
 * This should typically be called when the corresponding cmdstream
 * (containing the timestamp reads) is flushed to the kernel.
 */
void u_trace_flush(struct u_trace *ut, void *flush_data, bool free_data);
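
/* A sketch of the submit path, with hypothetical driver types; the fence and
 * submission id become available to u_trace_read_ts/perfetto during
 * processing, and the data is eventually released via the
 * u_trace_delete_flush_data callback:
 *
 *    struct driver_submit *submit = calloc(1, sizeof(*submit));
 *    submit->fence = fence_ref(out_fence);
 *    submit->submission_id = submission_id;
 *
 *    // 'true' is assumed here to mean u_trace owns (and will delete)
 *    // the flush data once it is done with it:
 *    u_trace_flush(&batch->trace, submit, true);
 */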

#ifdef HAVE_PERFETTO
static ALWAYS_INLINE bool
u_trace_perfetto_active(struct u_trace_context *utctx)
{
   return p_atomic_read_relaxed(&utctx->enabled_traces) &
          U_TRACE_TYPE_PERFETTO_ACTIVE;
}

void u_trace_perfetto_start(void);
void u_trace_perfetto_stop(void);
#else
static ALWAYS_INLINE bool
u_trace_perfetto_active(UNUSED struct u_trace_context *utctx)
{
   return false;
}
#endif

/**
 * Return whether utrace is enabled at all or not; this can be used to
 * gate any expensive traces.
 */
static ALWAYS_INLINE bool
u_trace_enabled(struct u_trace_context *utctx)
{
   return p_atomic_read_relaxed(&utctx->enabled_traces) != 0;
}
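
/* E.g. a driver could skip gathering an expensive tracepoint argument
 * entirely when nothing is listening (trace_draw_indirect() and
 * read_indirect_draw_count() are made-up names for illustration):
 *
 *    if (u_trace_enabled(&ctx->trace_context)) {
 *       uint32_t count = read_indirect_draw_count(ctx, info);
 *       trace_draw_indirect(&batch->trace, count);
 *    }
 */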

/**
 * Return whether chunks should be processed or not.
 */
static ALWAYS_INLINE bool
u_trace_should_process(struct u_trace_context *utctx)
{
   return p_atomic_read_relaxed(&utctx->enabled_traces) &
          U_TRACE_TYPE_REQUIRE_PROCESSING;
}

/**
 * Return whether to emit markers into the command stream even if the queue
 * isn't active.
 */
static ALWAYS_INLINE bool
u_trace_markers_enabled(struct u_trace_context *utctx)
{
   return p_atomic_read_relaxed(&utctx->enabled_traces) &
          U_TRACE_TYPE_MARKERS;
}

#ifdef __cplusplus
}
#endif

#endif /* _U_TRACE_H */