1 /*
2  * Copyright 2013 Nouveau Project
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  * Authors: Christoph Bumiller, Samuel Pitoiset
23  */
24 
25 #include "nvc0/nvc0_context.h"
26 
27 #include "nvc0/nvc0_compute.xml.h"
28 
29 int
nvc0_screen_compute_setup(struct nvc0_screen * screen,struct nouveau_pushbuf * push)30 nvc0_screen_compute_setup(struct nvc0_screen *screen,
31                           struct nouveau_pushbuf *push)
32 {
33    struct nouveau_object *chan = screen->base.channel;
34    struct nouveau_device *dev = screen->base.device;
35    uint32_t obj_class;
36    int ret;
37    int i;
38 
39    switch (dev->chipset & ~0xf) {
40    case 0xc0:
41    case 0xd0:
42       /* In theory, GF110+ should also support NVC8_COMPUTE_CLASS but,
43        * in practice, a ILLEGAL_CLASS dmesg fail appears when using it. */
44       obj_class = NVC0_COMPUTE_CLASS;
45       break;
46    default:
47       NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
48       return -1;
49    }
50 
51    ret = nouveau_object_new(chan, 0xbeef90c0, obj_class, NULL, 0,
52                             &screen->compute);
53    if (ret) {
54       NOUVEAU_ERR("Failed to allocate compute object: %d\n", ret);
55       return ret;
56    }
57 
58    BEGIN_NVC0(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
59    PUSH_DATA (push, screen->compute->oclass);
60 
61    /* hardware limit */
62    BEGIN_NVC0(push, NVC0_CP(MP_LIMIT), 1);
63    PUSH_DATA (push, screen->mp_count);
64    BEGIN_NVC0(push, NVC0_CP(CALL_LIMIT_LOG), 1);
65    PUSH_DATA (push, 0xf);
66 
67    BEGIN_NVC0(push, SUBC_CP(0x02a0), 1);
68    PUSH_DATA (push, 0x8000);
69 
70    /* global memory setup */
71    BEGIN_NVC0(push, SUBC_CP(0x02c4), 1);
72    PUSH_DATA (push, 0);
73    BEGIN_NIC0(push, NVC0_CP(GLOBAL_BASE), 0x100);
74    for (i = 0; i <= 0xff; i++)
75       PUSH_DATA (push, (0xc << 28) | (i << 16) | i);
76    BEGIN_NVC0(push, SUBC_CP(0x02c4), 1);
77    PUSH_DATA (push, 1);
78 
79    /* local memory and cstack setup */
80    BEGIN_NVC0(push, NVC0_CP(TEMP_ADDRESS_HIGH), 2);
81    PUSH_DATAh(push, screen->tls->offset);
82    PUSH_DATA (push, screen->tls->offset);
83    BEGIN_NVC0(push, NVC0_CP(TEMP_SIZE_HIGH), 2);
84    PUSH_DATAh(push, screen->tls->size);
85    PUSH_DATA (push, screen->tls->size);
86    BEGIN_NVC0(push, NVC0_CP(WARP_TEMP_ALLOC), 1);
87    PUSH_DATA (push, 0);
88    BEGIN_NVC0(push, NVC0_CP(LOCAL_BASE), 1);
89    PUSH_DATA (push, 0xff << 24);
90 
91    /* shared memory setup */
92    BEGIN_NVC0(push, NVC0_CP(CACHE_SPLIT), 1);
93    PUSH_DATA (push, NVC0_COMPUTE_CACHE_SPLIT_48K_SHARED_16K_L1);
94    BEGIN_NVC0(push, NVC0_CP(SHARED_BASE), 1);
95    PUSH_DATA (push, 0xfe << 24);
96    BEGIN_NVC0(push, NVC0_CP(SHARED_SIZE), 1);
97    PUSH_DATA (push, 0);
98 
99    /* code segment setup */
100    BEGIN_NVC0(push, NVC0_CP(CODE_ADDRESS_HIGH), 2);
101    PUSH_DATAh(push, screen->text->offset);
102    PUSH_DATA (push, screen->text->offset);
103 
104    /* textures */
105    BEGIN_NVC0(push, NVC0_CP(TIC_ADDRESS_HIGH), 3);
106    PUSH_DATAh(push, screen->txc->offset);
107    PUSH_DATA (push, screen->txc->offset);
108    PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1);
109 
110    /* samplers */
111    BEGIN_NVC0(push, NVC0_CP(TSC_ADDRESS_HIGH), 3);
112    PUSH_DATAh(push, screen->txc->offset + 65536);
113    PUSH_DATA (push, screen->txc->offset + 65536);
114    PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1);
115 
116    /* MS sample coordinate offsets */
117    BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
118    PUSH_DATA (push, NVC0_CB_AUX_SIZE);
119    PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
120    PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
121    BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 2 * 8);
122    PUSH_DATA (push, NVC0_CB_AUX_MS_INFO);
123    PUSH_DATA (push, 0); /* 0 */
124    PUSH_DATA (push, 0);
125    PUSH_DATA (push, 1); /* 1 */
126    PUSH_DATA (push, 0);
127    PUSH_DATA (push, 0); /* 2 */
128    PUSH_DATA (push, 1);
129    PUSH_DATA (push, 1); /* 3 */
130    PUSH_DATA (push, 1);
131    PUSH_DATA (push, 2); /* 4 */
132    PUSH_DATA (push, 0);
133    PUSH_DATA (push, 3); /* 5 */
134    PUSH_DATA (push, 0);
135    PUSH_DATA (push, 2); /* 6 */
136    PUSH_DATA (push, 1);
137    PUSH_DATA (push, 3); /* 7 */
138    PUSH_DATA (push, 1);
139 
140    return 0;
141 }
142 
143 static void
nvc0_compute_validate_samplers(struct nvc0_context * nvc0)144 nvc0_compute_validate_samplers(struct nvc0_context *nvc0)
145 {
146    bool need_flush = nvc0_validate_tsc(nvc0, 5);
147    if (need_flush) {
148       BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(TSC_FLUSH), 1);
149       PUSH_DATA (nvc0->base.pushbuf, 0);
150    }
151 
152    /* Invalidate all 3D samplers because they are aliased. */
153    for (int s = 0; s < 5; s++)
154       nvc0->samplers_dirty[s] = ~0;
155    nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLERS;
156 }
157 
158 static void
nvc0_compute_validate_textures(struct nvc0_context * nvc0)159 nvc0_compute_validate_textures(struct nvc0_context *nvc0)
160 {
161    bool need_flush = nvc0_validate_tic(nvc0, 5);
162    if (need_flush) {
163       BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(TIC_FLUSH), 1);
164       PUSH_DATA (nvc0->base.pushbuf, 0);
165    }
166 
167    /* Invalidate all 3D textures because they are aliased. */
168    for (int s = 0; s < 5; s++) {
169       for (int i = 0; i < nvc0->num_textures[s]; i++)
170          nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i));
171       nvc0->textures_dirty[s] = ~0;
172    }
173    nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES;
174 }
175 
176 static inline void
nvc0_compute_invalidate_constbufs(struct nvc0_context * nvc0)177 nvc0_compute_invalidate_constbufs(struct nvc0_context *nvc0)
178 {
179    int s;
180 
181    /* Invalidate all 3D constbufs because they are aliased with COMPUTE. */
182    for (s = 0; s < 5; s++) {
183       nvc0->constbuf_dirty[s] |= nvc0->constbuf_valid[s];
184       nvc0->state.uniform_buffer_bound[s] = false;
185    }
186    nvc0->dirty_3d |= NVC0_NEW_3D_CONSTBUF;
187 }
188 
189 static void
nvc0_compute_validate_constbufs(struct nvc0_context * nvc0)190 nvc0_compute_validate_constbufs(struct nvc0_context *nvc0)
191 {
192    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
193    const int s = 5;
194 
195    while (nvc0->constbuf_dirty[s]) {
196       int i = ffs(nvc0->constbuf_dirty[s]) - 1;
197       nvc0->constbuf_dirty[s] &= ~(1 << i);
198 
199       if (nvc0->constbuf[s][i].user) {
200          struct nouveau_bo *bo = nvc0->screen->uniform_bo;
201          const unsigned base = NVC0_CB_USR_INFO(s);
202          const unsigned size = nvc0->constbuf[s][0].size;
203          assert(i == 0); /* we really only want OpenGL uniforms here */
204          assert(nvc0->constbuf[s][0].u.data);
205 
206          if (!nvc0->state.uniform_buffer_bound[s]) {
207             nvc0->state.uniform_buffer_bound[s] = true;
208 
209             BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
210             PUSH_DATA (push, NVC0_MAX_CONSTBUF_SIZE);
211             PUSH_DATAh(push, bo->offset + base);
212             PUSH_DATA (push, bo->offset + base);
213             BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1);
214             PUSH_DATA (push, (0 << 8) | 1);
215          }
216          nvc0_cb_bo_push(&nvc0->base, bo, NV_VRAM_DOMAIN(&nvc0->screen->base),
217                          base, NVC0_MAX_CONSTBUF_SIZE, 0, (size + 3) / 4,
218                          nvc0->constbuf[s][0].u.data);
219       } else {
220          struct nv04_resource *res =
221             nv04_resource(nvc0->constbuf[s][i].u.buf);
222          if (res) {
223             BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
224             PUSH_DATA (push, nvc0->constbuf[s][i].size);
225             PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
226             PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
227             BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1);
228             PUSH_DATA (push, (i << 8) | 1);
229 
230             BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
231 
232             res->cb_bindings[s] |= 1 << i;
233          } else {
234             BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1);
235             PUSH_DATA (push, (i << 8) | 0);
236          }
237          if (i == 0)
238             nvc0->state.uniform_buffer_bound[s] = false;
239       }
240    }
241 
242    nvc0_compute_invalidate_constbufs(nvc0);
243 
244    BEGIN_NVC0(push, NVC0_CP(FLUSH), 1);
245    PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CB);
246 }
247 
248 static void
nvc0_compute_validate_driverconst(struct nvc0_context * nvc0)249 nvc0_compute_validate_driverconst(struct nvc0_context *nvc0)
250 {
251    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
252    struct nvc0_screen *screen = nvc0->screen;
253 
254    BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
255    PUSH_DATA (push, NVC0_CB_AUX_SIZE);
256    PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
257    PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
258    BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1);
259    PUSH_DATA (push, (15 << 8) | 1);
260 
261    nvc0->dirty_3d |= NVC0_NEW_3D_DRIVERCONST;
262 }
263 
264 static void
nvc0_compute_validate_buffers(struct nvc0_context * nvc0)265 nvc0_compute_validate_buffers(struct nvc0_context *nvc0)
266 {
267    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
268    struct nvc0_screen *screen = nvc0->screen;
269    const int s = 5;
270    int i;
271 
272    BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
273    PUSH_DATA (push, NVC0_CB_AUX_SIZE);
274    PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
275    PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
276    BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS);
277    PUSH_DATA (push, NVC0_CB_AUX_BUF_INFO(0));
278 
279    for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
280       if (nvc0->buffers[s][i].buffer) {
281          struct nv04_resource *res =
282             nv04_resource(nvc0->buffers[s][i].buffer);
283          PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset);
284          PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset);
285          PUSH_DATA (push, nvc0->buffers[s][i].buffer_size);
286          PUSH_DATA (push, 0);
287          BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR);
288          util_range_add(&res->base, &res->valid_buffer_range,
289                         nvc0->buffers[s][i].buffer_offset,
290                         nvc0->buffers[s][i].buffer_offset +
291                         nvc0->buffers[s][i].buffer_size);
292       } else {
293          PUSH_DATA (push, 0);
294          PUSH_DATA (push, 0);
295          PUSH_DATA (push, 0);
296          PUSH_DATA (push, 0);
297       }
298    }
299 }
300 
301 void
nvc0_compute_validate_globals(struct nvc0_context * nvc0)302 nvc0_compute_validate_globals(struct nvc0_context *nvc0)
303 {
304    unsigned i;
305 
306    for (i = 0; i < nvc0->global_residents.size / sizeof(struct pipe_resource *);
307         ++i) {
308       struct pipe_resource *res = *util_dynarray_element(
309          &nvc0->global_residents, struct pipe_resource *, i);
310       if (res)
311          nvc0_add_resident(nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL,
312                            nv04_resource(res), NOUVEAU_BO_RDWR);
313    }
314 }
315 
316 static inline void
nvc0_compute_invalidate_surfaces(struct nvc0_context * nvc0,const int s)317 nvc0_compute_invalidate_surfaces(struct nvc0_context *nvc0, const int s)
318 {
319    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
320    int i;
321 
322    for (i = 0; i < NVC0_MAX_IMAGES; ++i) {
323       if (s == 5)
324          BEGIN_NVC0(push, NVC0_CP(IMAGE(i)), 6);
325       else
326          BEGIN_NVC0(push, NVC0_3D(IMAGE(i)), 6);
327       PUSH_DATA(push, 0);
328       PUSH_DATA(push, 0);
329       PUSH_DATA(push, 0);
330       PUSH_DATA(push, 0);
331       PUSH_DATA(push, 0x14000);
332       PUSH_DATA(push, 0);
333    }
334 }
335 
336 static void
nvc0_compute_validate_surfaces(struct nvc0_context * nvc0)337 nvc0_compute_validate_surfaces(struct nvc0_context *nvc0)
338 {
339    /* TODO: Invalidating both 3D and CP surfaces before validating surfaces for
340     * compute is probably not really necessary, but we didn't find any better
341     * solutions for now. This fixes some invalidation issues when compute and
342     * fragment shaders are used inside the same context. Anyway, we definitely
343     * have invalidation issues between 3D and CP for other resources like SSBO
344     * and atomic counters. */
345    nvc0_compute_invalidate_surfaces(nvc0, 4);
346    nvc0_compute_invalidate_surfaces(nvc0, 5);
347 
348    nvc0_validate_suf(nvc0, 5);
349 
350    /* Invalidate all FRAGMENT images because they are aliased with COMPUTE. */
351    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_SUF);
352    nvc0->dirty_3d |= NVC0_NEW_3D_SURFACES;
353    nvc0->images_dirty[4] |= nvc0->images_valid[4];
354 }
355 
356 static struct nvc0_state_validate
357 validate_list_cp[] = {
358    { nvc0_compprog_validate,              NVC0_NEW_CP_PROGRAM     },
359    { nvc0_compute_validate_constbufs,     NVC0_NEW_CP_CONSTBUF    },
360    { nvc0_compute_validate_driverconst,   NVC0_NEW_CP_DRIVERCONST },
361    { nvc0_compute_validate_buffers,       NVC0_NEW_CP_BUFFERS     },
362    { nvc0_compute_validate_textures,      NVC0_NEW_CP_TEXTURES    },
363    { nvc0_compute_validate_samplers,      NVC0_NEW_CP_SAMPLERS    },
364    { nvc0_compute_validate_globals,       NVC0_NEW_CP_GLOBALS     },
365    { nvc0_compute_validate_surfaces,      NVC0_NEW_CP_SURFACES    },
366 };
367 
368 static bool
nvc0_state_validate_cp(struct nvc0_context * nvc0,uint32_t mask)369 nvc0_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
370 {
371    bool ret;
372 
373    ret = nvc0_state_validate(nvc0, mask, validate_list_cp,
374                              ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp,
375                              nvc0->bufctx_cp);
376 
377    if (unlikely(nvc0->state.flushed))
378       nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
379    return ret;
380 }
381 
382 static void
nvc0_compute_upload_input(struct nvc0_context * nvc0,const struct pipe_grid_info * info)383 nvc0_compute_upload_input(struct nvc0_context *nvc0,
384                           const struct pipe_grid_info *info)
385 {
386    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
387    struct nvc0_screen *screen = nvc0->screen;
388    struct nvc0_program *cp = nvc0->compprog;
389 
390    if (cp->parm_size) {
391       struct nouveau_bo *bo = screen->uniform_bo;
392       const unsigned base = NVC0_CB_USR_INFO(5);
393 
394       BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
395       PUSH_DATA (push, align(cp->parm_size, 0x100));
396       PUSH_DATAh(push, bo->offset + base);
397       PUSH_DATA (push, bo->offset + base);
398       BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1);
399       PUSH_DATA (push, (0 << 8) | 1);
400       /* NOTE: size is limited to 4 KiB, which is < NV04_PFIFO_MAX_PACKET_LEN */
401       BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + cp->parm_size / 4);
402       PUSH_DATA (push, 0);
403       PUSH_DATAp(push, info->input, cp->parm_size / 4);
404 
405       nvc0_compute_invalidate_constbufs(nvc0);
406    }
407 
408    BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
409    PUSH_DATA (push, NVC0_CB_AUX_SIZE);
410    PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
411    PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
412 
413    BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 1);
414    /* (7) as we only upload work_dim on nvc0, the rest uses special regs */
415    PUSH_DATA (push, NVC0_CB_AUX_GRID_INFO(7));
416    PUSH_DATA (push, info->work_dim);
417 
418    BEGIN_NVC0(push, NVC0_CP(FLUSH), 1);
419    PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CB);
420 }
421 
422 void
nvc0_launch_grid(struct pipe_context * pipe,const struct pipe_grid_info * info)423 nvc0_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
424 {
425    struct nvc0_context *nvc0 = nvc0_context(pipe);
426    struct nvc0_screen *screen = nvc0->screen;
427    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
428    struct nvc0_program *cp = nvc0->compprog;
429    int ret;
430 
431    ret = !nvc0_state_validate_cp(nvc0, ~0);
432    if (ret) {
433       NOUVEAU_ERR("Failed to launch grid !\n");
434       return;
435    }
436 
437    nvc0_compute_upload_input(nvc0, info);
438 
439    BEGIN_NVC0(push, NVC0_CP(CP_START_ID), 1);
440    PUSH_DATA (push, cp->code_base);
441 
442    BEGIN_NVC0(push, NVC0_CP(LOCAL_POS_ALLOC), 3);
443    PUSH_DATA (push, (cp->hdr[1] & 0xfffff0) + align(cp->cp.lmem_size, 0x10));
444    PUSH_DATA (push, 0);
445    PUSH_DATA (push, 0x800); /* WARP_CSTACK_SIZE */
446 
447    BEGIN_NVC0(push, NVC0_CP(SHARED_SIZE), 3);
448    PUSH_DATA (push, align(cp->cp.smem_size, 0x100));
449    PUSH_DATA (push, info->block[0] * info->block[1] * info->block[2]);
450    PUSH_DATA (push, cp->num_barriers);
451    BEGIN_NVC0(push, NVC0_CP(CP_GPR_ALLOC), 1);
452    PUSH_DATA (push, cp->num_gprs);
453 
454    /* launch preliminary setup */
455    BEGIN_NVC0(push, NVC0_CP(GRIDID), 1);
456    PUSH_DATA (push, 0x1);
457    BEGIN_NVC0(push, SUBC_CP(0x036c), 1);
458    PUSH_DATA (push, 0);
459    BEGIN_NVC0(push, NVC0_CP(FLUSH), 1);
460    PUSH_DATA (push, NVC0_COMPUTE_FLUSH_GLOBAL | NVC0_COMPUTE_FLUSH_UNK8);
461 
462    /* block setup */
463    BEGIN_NVC0(push, NVC0_CP(BLOCKDIM_YX), 2);
464    PUSH_DATA (push, (info->block[1] << 16) | info->block[0]);
465    PUSH_DATA (push, info->block[2]);
466 
467    nouveau_pushbuf_space(push, 32, 2, 1);
468    PUSH_REFN(push, screen->text, NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD);
469 
470    if (unlikely(info->indirect)) {
471       struct nv04_resource *res = nv04_resource(info->indirect);
472       uint32_t offset = res->offset + info->indirect_offset;
473       unsigned macro = NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT;
474 
475       PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
476       PUSH_DATA(push, NVC0_FIFO_PKHDR_1I(1, macro, 3));
477       nouveau_pushbuf_data(push, res->bo, offset,
478                            NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
479    } else {
480       /* grid setup */
481       BEGIN_NVC0(push, NVC0_CP(GRIDDIM_YX), 2);
482       PUSH_DATA (push, (info->grid[1] << 16) | info->grid[0]);
483       PUSH_DATA (push, info->grid[2]);
484 
485       /* kernel launching */
486       BEGIN_NVC0(push, NVC0_CP(COMPUTE_BEGIN), 1);
487       PUSH_DATA (push, 0);
488       BEGIN_NVC0(push, SUBC_CP(0x0a08), 1);
489       PUSH_DATA (push, 0);
490       BEGIN_NVC0(push, NVC0_CP(LAUNCH), 1);
491       PUSH_DATA (push, 0x1000);
492       BEGIN_NVC0(push, NVC0_CP(COMPUTE_END), 1);
493       PUSH_DATA (push, 0);
494       BEGIN_NVC0(push, SUBC_CP(0x0360), 1);
495       PUSH_DATA (push, 0x1);
496    }
497 
498    /* TODO: Not sure if this is really necessary. */
499    nvc0_compute_invalidate_surfaces(nvc0, 5);
500    nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF);
501    nvc0->dirty_cp |= NVC0_NEW_CP_SURFACES;
502    nvc0->images_dirty[5] |= nvc0->images_valid[5];
503 
504    nvc0_update_compute_invocations_counter(nvc0, info);
505 }
506 
507 static void
nvc0_compute_update_indirect_invocations(struct nvc0_context * nvc0,const struct pipe_grid_info * info)508 nvc0_compute_update_indirect_invocations(struct nvc0_context *nvc0,
509                                          const struct pipe_grid_info *info) {
510    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
511    struct nv04_resource *res = nv04_resource(info->indirect);
512    uint32_t offset = res->offset + info->indirect_offset;
513 
514    nouveau_pushbuf_space(push, 16, 0, 8);
515    PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
516    BEGIN_1IC0(push, NVC0_3D(MACRO_COMPUTE_COUNTER), 7);
517    PUSH_DATA(push, 6);
518    PUSH_DATA(push, info->block[0]);
519    PUSH_DATA(push, info->block[1]);
520    PUSH_DATA(push, info->block[2]);
521    nouveau_pushbuf_data(push, res->bo, offset,
522                         NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
523 }
524 
525 void
nvc0_update_compute_invocations_counter(struct nvc0_context * nvc0,const struct pipe_grid_info * info)526 nvc0_update_compute_invocations_counter(struct nvc0_context *nvc0,
527                                         const struct pipe_grid_info *info) {
528    if (unlikely(info->indirect)) {
529       nvc0_compute_update_indirect_invocations(nvc0, info);
530    } else {
531       uint64_t invocations = info->block[0] * info->block[1] * info->block[2];
532       invocations *= info->grid[0] * info->grid[1] * info->grid[2];
533       nvc0->compute_invocations += invocations;
534    }
535 }
536