/*
 * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#ifndef FREEDRENO_RINGBUFFER_H_
#define FREEDRENO_RINGBUFFER_H_

#include <stdio.h>
#include "util/u_debug.h"
#include "util/u_dynarray.h"

#include "freedreno_drmif.h"
#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"

struct fd_submit;
struct fd_ringbuffer;

enum fd_ringbuffer_flags {

	/* Primary ringbuffer for a submit, ie. an IB1-level rb for
	 * which the kernel must set up RB->IB1 CP_INDIRECT_BRANCH
	 * packets.
	 */
	FD_RINGBUFFER_PRIMARY = 0x1,

	/* Hint that the stateobj will be used for streaming state
	 * that is used once or a few times and then discarded.
	 *
	 * For sub-allocation, non-streaming stateobjs should be
	 * sub-allocated from a page-sized buffer, so one long-lived
	 * stateobj doesn't prevent other pages from being freed.
	 * (Ie. it would be no worse than allocating a page-sized
	 * bo for each small non-streaming stateobj.)
	 *
	 * But streaming stateobjs can be sub-allocated from a
	 * larger buffer to reduce the alloc/del overhead.
	 */
	FD_RINGBUFFER_STREAMING = 0x2,

	/* Indicates that a "growable" cmdstream can be used,
	 * consisting of multiple physical cmdstream buffers.
	 */
	FD_RINGBUFFER_GROWABLE = 0x4,

	/* Internal use only: */
	_FD_RINGBUFFER_OBJECT = 0x8,
};

/* A submit object manages/tracks all the state buildup for a "submit"
 * ioctl to the kernel.  Additionally, with the exception of long-lived
 * non-STREAMING stateobj rb's, rb's are allocated from the submit.
 */
struct fd_submit * fd_submit_new(struct fd_pipe *pipe);

/* NOTE: all ringbuffers created from the submit should be unref'd
 * before destroying the submit.
 */
void fd_submit_del(struct fd_submit *submit);

/* Allocate a new rb from the submit. */
struct fd_ringbuffer * fd_submit_new_ringbuffer(struct fd_submit *submit,
		uint32_t size, enum fd_ringbuffer_flags flags);

/* in_fence_fd: -1 for no in-fence, else fence fd
 * out_fence_fd: NULL for no output-fence requested, else ptr to return out-fence
 */
int fd_submit_flush(struct fd_submit *submit,
		int in_fence_fd, int *out_fence_fd,
		uint32_t *out_fence);
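
/* Typical submit lifecycle, as a sketch (error handling elided; 'pipe'
 * is assumed to be a previously created fd_pipe):
 *
 *    struct fd_submit *submit = fd_submit_new(pipe);
 *    struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(submit, 0x1000,
 *          FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
 *
 *    ... build cmdstream with OUT_PKT*()/OUT_RING()/OUT_RELOC() ...
 *
 *    uint32_t fence;
 *    fd_submit_flush(submit, -1, NULL, &fence);
 *    fd_ringbuffer_del(ring);
 *    fd_submit_del(submit);
 */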

struct fd_ringbuffer;
struct fd_reloc;

struct fd_ringbuffer_funcs {
	void (*grow)(struct fd_ringbuffer *ring, uint32_t size);
	void (*emit_reloc)(struct fd_ringbuffer *ring,
			const struct fd_reloc *reloc);
	uint32_t (*emit_reloc_ring)(struct fd_ringbuffer *ring,
			struct fd_ringbuffer *target, uint32_t cmd_idx);
	uint32_t (*cmd_count)(struct fd_ringbuffer *ring);
	void (*destroy)(struct fd_ringbuffer *ring);
};
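
/* Backends implement a ring by embedding struct fd_ringbuffer at the
 * start of their own ringbuffer struct and filling in the funcs table,
 * roughly like this sketch (all 'foo' names hypothetical):
 *
 *    struct foo_ringbuffer {
 *       struct fd_ringbuffer base;   // must be first so we can downcast
 *       ...                          // backend state, e.g. backing bo
 *    };
 *
 *    static const struct fd_ringbuffer_funcs foo_funcs = {
 *       .grow = foo_grow,
 *       .emit_reloc = foo_emit_reloc,
 *       .emit_reloc_ring = foo_emit_reloc_ring,
 *       .cmd_count = foo_cmd_count,
 *       .destroy = foo_destroy,
 *    };
 */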

/* the ringbuffer object is not opaque so that OUT_RING() type stuff
 * can be inlined.  Note that users should not make assumptions about
 * the size of this struct.
 */
struct fd_ringbuffer {
	uint32_t *cur, *end, *start;
	const struct fd_ringbuffer_funcs *funcs;

	/* size or end could probably go away */
	int size;
	int32_t refcnt;
	enum fd_ringbuffer_flags flags;
};

/* Allocate a new long-lived state object, not associated with
 * a submit:
 */
struct fd_ringbuffer * fd_ringbuffer_new_object(struct fd_pipe *pipe,
		uint32_t size);

static inline void
fd_ringbuffer_del(struct fd_ringbuffer *ring)
{
	if (--ring->refcnt > 0)
		return;

	ring->funcs->destroy(ring);
}

static inline
struct fd_ringbuffer *
fd_ringbuffer_ref(struct fd_ringbuffer *ring)
{
	ring->refcnt++;
	return ring;
}

static inline void
fd_ringbuffer_grow(struct fd_ringbuffer *ring, uint32_t ndwords)
{
	assert(ring->funcs->grow);     /* unsupported on kgsl */

	/* there is an upper bound on IB size, which appears to be 0x100000 */
	if (ring->size < 0x100000)
		ring->size *= 2;

	ring->funcs->grow(ring, ring->size);
}

static inline void
fd_ringbuffer_emit(struct fd_ringbuffer *ring,
		uint32_t data)
{
	(*ring->cur++) = data;
}

struct fd_reloc {
	struct fd_bo *bo;
#define FD_RELOC_READ             0x0001
#define FD_RELOC_WRITE            0x0002
#define FD_RELOC_DUMP             0x0004
	uint32_t offset;
	uint32_t or;
	int32_t  shift;
	uint32_t orhi;      /* used for a5xx+ */
};

/* We always mark BOs for write, instead of tracking it across reloc
 * sources in userspace.  On the kernel side, this means we track a single
 * excl fence in the BO instead of a set of read fences, which is cheaper.
 * The downside is that a dmabuf-shared device won't be able to read in
 * parallel with a read-only access by freedreno, but most other drivers
 * have likewise decided that this usecase isn't important enough to
 * justify the extra tracking.
 */
#define FD_RELOC_FLAGS_INIT (FD_RELOC_READ | FD_RELOC_WRITE)

/* NOTE: relocs are 2 dwords on a5xx+ */

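/* How a reloc gets applied, roughly (a simplified sketch of the kernel
 * msm submit path; bo_iova stands for the GPU address of reloc->bo):
 *
 *    uint64_t iova = bo_iova + reloc->offset;
 *    if (reloc->shift < 0)
 *       iova >>= -reloc->shift;
 *    else
 *       iova <<= reloc->shift;
 *    dword[0] = iova | reloc->or;
 *    dword[1] = (iova >> 32) | reloc->orhi;   // second dword, a5xx+ only
 */
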
static inline void
fd_ringbuffer_reloc(struct fd_ringbuffer *ring,
		const struct fd_reloc *reloc)
{
	ring->funcs->emit_reloc(ring, reloc);
}

static inline uint32_t
fd_ringbuffer_cmd_count(struct fd_ringbuffer *ring)
{
	if (!ring->funcs->cmd_count)
		return 1;
	return ring->funcs->cmd_count(ring);
}

static inline uint32_t
fd_ringbuffer_emit_reloc_ring_full(struct fd_ringbuffer *ring,
		struct fd_ringbuffer *target, uint32_t cmd_idx)
{
	return ring->funcs->emit_reloc_ring(ring, target, cmd_idx);
}

static inline uint32_t
offset_bytes(void *end, void *start)
{
	return ((char *)end) - ((char *)start);
}

static inline uint32_t
fd_ringbuffer_size(struct fd_ringbuffer *ring)
{
	/* only really needed for stateobj ringbuffers, and won't really
	 * do what you expect for growable rb's.. so let's just restrict
	 * this to stateobj's for now:
	 */
	debug_assert(!(ring->flags & FD_RINGBUFFER_GROWABLE));
	return offset_bytes(ring->cur, ring->start);
}

#define LOG_DWORDS 0

static inline void
OUT_RING(struct fd_ringbuffer *ring, uint32_t data)
{
	if (LOG_DWORDS) {
		fprintf(stderr, "ring[%p]: OUT_RING   %04x:  %08x\n", ring,
				(uint32_t)(ring->cur - ring->start), data);
	}
	fd_ringbuffer_emit(ring, data);
}

/*
 * NOTE: OUT_RELOC() is 2 dwords (64b) on a5xx+
 */
static inline void
OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo,
		uint32_t offset, uint64_t or, int32_t shift)
{
	if (LOG_DWORDS) {
		fprintf(stderr, "ring[%p]: OUT_RELOC   %04x:  %p+%u << %d\n", ring,
				(uint32_t)(ring->cur - ring->start), bo, offset, shift);
	}
	debug_assert(offset < fd_bo_size(bo));
	fd_ringbuffer_reloc(ring, &(struct fd_reloc){
		.bo = bo,
		.offset = offset,
		.or = or,
		.shift = shift,
		.orhi = or >> 32,
	});
}
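
/* Usage sketch: the reloc is emitted in place of a packet's address
 * dwords.  E.g. writing a single dword to a bo with CP_MEM_WRITE on
 * a5xx+ ('bo' and the payload value here are assumptions):
 *
 *    OUT_PKT7(ring, CP_MEM_WRITE, 3);
 *    OUT_RELOC(ring, bo, 0, 0, 0);   // 64b destination address (2 dwords)
 *    OUT_RING(ring, 0x12345678);     // payload dword
 *
 * The backend records the bo against the submit so that the final iova
 * can be patched into the cmdstream.
 */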

static inline void
OUT_RB(struct fd_ringbuffer *ring, struct fd_ringbuffer *target)
{
	fd_ringbuffer_emit_reloc_ring_full(ring, target, 0);
}

static inline void BEGIN_RING(struct fd_ringbuffer *ring, uint32_t ndwords)
{
	if (unlikely(ring->cur + ndwords > ring->end))
		fd_ringbuffer_grow(ring, ndwords);
}

static inline void
OUT_PKT0(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
{
	BEGIN_RING(ring, cnt+1);
	OUT_RING(ring, CP_TYPE0_PKT | ((cnt-1) << 16) | (regindx & 0x7FFF));
}

static inline void
OUT_PKT2(struct fd_ringbuffer *ring)
{
	BEGIN_RING(ring, 1);
	OUT_RING(ring, CP_TYPE2_PKT);
}

static inline void
OUT_PKT3(struct fd_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
{
	BEGIN_RING(ring, cnt+1);
	OUT_RING(ring, CP_TYPE3_PKT | ((cnt-1) << 16) | ((opcode & 0xFF) << 8));
}

/*
 * Starting with a5xx, pkt4/pkt7 are used instead of pkt0/pkt3
 */

static inline unsigned
_odd_parity_bit(unsigned val)
{
	/* See: http://graphics.stanford.edu/~seander/bithacks.html#ParityParallel
	 * note that we want odd parity so 0x6996 is inverted.
	 */
	val ^= val >> 16;
	val ^= val >> 8;
	val ^= val >> 4;
	val &= 0xf;
	return (~0x6996 >> val) & 1;
}
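
/* E.g. _odd_parity_bit(0x3) == 1: two bits set, so the parity bit must
 * be 1 for the value plus parity bit to have an odd number of set bits.
 * Conversely _odd_parity_bit(0x7) == 0, since three bits are already set.
 */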

static inline void
OUT_PKT4(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
{
	BEGIN_RING(ring, cnt+1);
	OUT_RING(ring, CP_TYPE4_PKT | cnt |
			(_odd_parity_bit(cnt) << 7) |
			((regindx & 0x3ffff) << 8) |
			((_odd_parity_bit(regindx) << 27)));
}

static inline void
OUT_PKT7(struct fd_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
{
	BEGIN_RING(ring, cnt+1);
	OUT_RING(ring, CP_TYPE7_PKT | cnt |
			(_odd_parity_bit(cnt) << 15) |
			((opcode & 0x7f) << 16) |
			((_odd_parity_bit(opcode) << 23)));
}
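
/* The resulting pkt7 header layout: bits 0..14 = payload dword count,
 * bit 15 = count parity, bits 16..22 = opcode, bit 23 = opcode parity,
 * and the high bits (CP_TYPE7_PKT) select the packet type.  Pkt4 is
 * analogous: bits 0..6 = count, bit 7 = count parity, bits 8..25 =
 * register index, bit 27 = register-index parity.
 */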

static inline void
OUT_WFI(struct fd_ringbuffer *ring)
{
	OUT_PKT3(ring, CP_WAIT_FOR_IDLE, 1);
	OUT_RING(ring, 0x00000000);
}

static inline void
OUT_WFI5(struct fd_ringbuffer *ring)
{
	OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
}

#endif /* FREEDRENO_RINGBUFFER_H_ */