1 /*
2  * Copyright 2010 Christoph Bumiller
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "pipe/p_defines.h"
24 
25 #include "compiler/nir/nir.h"
26 
27 #include "nv50/nv50_program.h"
28 #include "nv50/nv50_context.h"
29 
30 #include "codegen/nv50_ir_driver.h"
31 
/* Count the set bits in the low nibble of val; the result is in 0..4.
 * Any bits above bit 3 are ignored.
 */
static inline unsigned
bitcount4(const uint32_t val)
{
   const uint32_t nibble = val & 0xf;

   return (nibble & 1) + ((nibble >> 1) & 1) +
          ((nibble >> 2) & 1) + ((nibble >> 3) & 1);
}
39 
40 static int
nv50_vertprog_assign_slots(struct nv50_ir_prog_info_out * info)41 nv50_vertprog_assign_slots(struct nv50_ir_prog_info_out *info)
42 {
43    struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
44    unsigned i, n, c;
45 
46    n = 0;
47    for (i = 0; i < info->numInputs; ++i) {
48       prog->in[i].id = i;
49       prog->in[i].sn = info->in[i].sn;
50       prog->in[i].si = info->in[i].si;
51       prog->in[i].hw = n;
52       prog->in[i].mask = info->in[i].mask;
53 
54       prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32);
55 
56       for (c = 0; c < 4; ++c)
57          if (info->in[i].mask & (1 << c))
58             info->in[i].slot[c] = n++;
59 
60       if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
61          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
62    }
63    prog->in_nr = info->numInputs;
64 
65    for (i = 0; i < info->numSysVals; ++i) {
66       switch (info->sv[i].sn) {
67       case TGSI_SEMANTIC_INSTANCEID:
68          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;
69          continue;
70       case TGSI_SEMANTIC_VERTEXID:
71          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
72          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID_DRAW_ARRAYS_ADD_START;
73          continue;
74       default:
75          break;
76       }
77    }
78 
79    /*
80     * Corner case: VP has no inputs, but we will still need to submit data to
81     * draw it. HW will shout at us and won't draw anything if we don't enable
82     * any input, so let's just pretend it's the first one.
83     */
84    if (prog->vp.attrs[0] == 0 &&
85        prog->vp.attrs[1] == 0 &&
86        prog->vp.attrs[2] == 0)
87       prog->vp.attrs[0] |= 0xf;
88 
89    /* VertexID before InstanceID */
90    if (info->io.vertexId < info->numSysVals)
91       info->sv[info->io.vertexId].slot[0] = n++;
92    if (info->io.instanceId < info->numSysVals)
93       info->sv[info->io.instanceId].slot[0] = n++;
94 
95    n = 0;
96    for (i = 0; i < info->numOutputs; ++i) {
97       switch (info->out[i].sn) {
98       case TGSI_SEMANTIC_PSIZE:
99          prog->vp.psiz = i;
100          break;
101       case TGSI_SEMANTIC_CLIPDIST:
102          prog->vp.clpd[info->out[i].si] = n;
103          break;
104       case TGSI_SEMANTIC_EDGEFLAG:
105          prog->vp.edgeflag = i;
106          break;
107       case TGSI_SEMANTIC_BCOLOR:
108          prog->vp.bfc[info->out[i].si] = i;
109          break;
110       case TGSI_SEMANTIC_LAYER:
111          prog->gp.has_layer = true;
112          prog->gp.layerid = n;
113          break;
114       case TGSI_SEMANTIC_VIEWPORT_INDEX:
115          prog->gp.has_viewport = true;
116          prog->gp.viewportid = n;
117          break;
118       default:
119          break;
120       }
121       prog->out[i].id = i;
122       prog->out[i].sn = info->out[i].sn;
123       prog->out[i].si = info->out[i].si;
124       prog->out[i].hw = n;
125       prog->out[i].mask = info->out[i].mask;
126 
127       for (c = 0; c < 4; ++c)
128          if (info->out[i].mask & (1 << c))
129             info->out[i].slot[c] = n++;
130    }
131    prog->out_nr = info->numOutputs;
132    prog->max_out = n;
133    if (!prog->max_out)
134       prog->max_out = 1;
135 
136    if (prog->vp.psiz < info->numOutputs)
137       prog->vp.psiz = prog->out[prog->vp.psiz].hw;
138 
139    return 0;
140 }
141 
142 static int
nv50_fragprog_assign_slots(struct nv50_ir_prog_info_out * info)143 nv50_fragprog_assign_slots(struct nv50_ir_prog_info_out *info)
144 {
145    struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
146    unsigned i, n, m, c;
147    unsigned nvary;
148    unsigned nflat;
149    unsigned nintp = 0;
150 
151    /* count recorded non-flat inputs */
152    for (m = 0, i = 0; i < info->numInputs; ++i) {
153       switch (info->in[i].sn) {
154       case TGSI_SEMANTIC_POSITION:
155          continue;
156       default:
157          m += info->in[i].flat ? 0 : 1;
158          break;
159       }
160    }
161    /* careful: id may be != i in info->in[prog->in[i].id] */
162 
163    /* Fill prog->in[] so that non-flat inputs are first and
164     * kick out special inputs that don't use the RESULT_MAP.
165     */
166    for (n = 0, i = 0; i < info->numInputs; ++i) {
167       if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {
168          prog->fp.interp |= info->in[i].mask << 24;
169          for (c = 0; c < 4; ++c)
170             if (info->in[i].mask & (1 << c))
171                info->in[i].slot[c] = nintp++;
172       } else {
173          unsigned j = info->in[i].flat ? m++ : n++;
174 
175          if (info->in[i].sn == TGSI_SEMANTIC_COLOR)
176             prog->vp.bfc[info->in[i].si] = j;
177          else if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
178             prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
179 
180          prog->in[j].id = i;
181          prog->in[j].mask = info->in[i].mask;
182          prog->in[j].sn = info->in[i].sn;
183          prog->in[j].si = info->in[i].si;
184          prog->in[j].linear = info->in[i].linear;
185 
186          prog->in_nr++;
187       }
188    }
189    if (!(prog->fp.interp & (8 << 24))) {
190       ++nintp;
191       prog->fp.interp |= 8 << 24;
192    }
193 
194    for (i = 0; i < prog->in_nr; ++i) {
195       int j = prog->in[i].id;
196 
197       prog->in[i].hw = nintp;
198       for (c = 0; c < 4; ++c)
199          if (prog->in[i].mask & (1 << c))
200             info->in[j].slot[c] = nintp++;
201    }
202    /* (n == m) if m never increased, i.e. no flat inputs */
203    nflat = (n < m) ? (nintp - prog->in[n].hw) : 0;
204    nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */
205    nvary = nintp - nflat;
206 
207    prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;
208    prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;
209 
210    /* put front/back colors right after HPOS */
211    prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;
212    for (i = 0; i < 2; ++i)
213       if (prog->vp.bfc[i] < 0xff)
214          prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;
215 
216    /* FP outputs */
217 
218    if (info->prop.fp.numColourResults > 1)
219       prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;
220 
221    for (i = 0; i < info->numOutputs; ++i) {
222       prog->out[i].id = i;
223       prog->out[i].sn = info->out[i].sn;
224       prog->out[i].si = info->out[i].si;
225       prog->out[i].mask = info->out[i].mask;
226 
227       if (i == info->io.fragDepth || i == info->io.sampleMask)
228          continue;
229       prog->out[i].hw = info->out[i].si * 4;
230 
231       for (c = 0; c < 4; ++c)
232          info->out[i].slot[c] = prog->out[i].hw + c;
233 
234       prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);
235    }
236 
237    if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS) {
238       info->out[info->io.sampleMask].slot[0] = prog->max_out++;
239       prog->fp.has_samplemask = 1;
240    }
241 
242    if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
243       info->out[info->io.fragDepth].slot[2] = prog->max_out++;
244 
245    if (!prog->max_out)
246       prog->max_out = 4;
247 
248    return 0;
249 }
250 
251 static int
nv50_program_assign_varying_slots(struct nv50_ir_prog_info_out * info)252 nv50_program_assign_varying_slots(struct nv50_ir_prog_info_out *info)
253 {
254    switch (info->type) {
255    case PIPE_SHADER_VERTEX:
256       return nv50_vertprog_assign_slots(info);
257    case PIPE_SHADER_GEOMETRY:
258       return nv50_vertprog_assign_slots(info);
259    case PIPE_SHADER_FRAGMENT:
260       return nv50_fragprog_assign_slots(info);
261    case PIPE_SHADER_COMPUTE:
262       return 0;
263    default:
264       return -1;
265    }
266 }
267 
268 static struct nv50_stream_output_state *
nv50_program_create_strmout_state(const struct nv50_ir_prog_info_out * info,const struct pipe_stream_output_info * pso)269 nv50_program_create_strmout_state(const struct nv50_ir_prog_info_out *info,
270                                   const struct pipe_stream_output_info *pso)
271 {
272    struct nv50_stream_output_state *so;
273    unsigned b, i, c;
274    unsigned base[4];
275 
276    so = MALLOC_STRUCT(nv50_stream_output_state);
277    if (!so)
278       return NULL;
279    memset(so->map, 0xff, sizeof(so->map));
280 
281    for (b = 0; b < 4; ++b)
282       so->num_attribs[b] = 0;
283    for (i = 0; i < pso->num_outputs; ++i) {
284       unsigned end =  pso->output[i].dst_offset + pso->output[i].num_components;
285       b = pso->output[i].output_buffer;
286       assert(b < 4);
287       so->num_attribs[b] = MAX2(so->num_attribs[b], end);
288    }
289 
290    so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;
291 
292    so->stride[0] = pso->stride[0] * 4;
293    base[0] = 0;
294    for (b = 1; b < 4; ++b) {
295       assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
296       so->stride[b] = so->num_attribs[b] * 4;
297       if (so->num_attribs[b])
298          so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
299       base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
300    }
301    if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
302       assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
303       so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
304    }
305 
306    so->map_size = base[3] + so->num_attribs[3];
307 
308    for (i = 0; i < pso->num_outputs; ++i) {
309       const unsigned s = pso->output[i].start_component;
310       const unsigned p = pso->output[i].dst_offset;
311       const unsigned r = pso->output[i].register_index;
312       b = pso->output[i].output_buffer;
313 
314       if (r >= info->numOutputs)
315          continue;
316 
317       for (c = 0; c < pso->output[i].num_components; ++c)
318          so->map[base[b] + p + c] = info->out[r].slot[s + c];
319    }
320 
321    return so;
322 }
323 
324 bool
nv50_program_translate(struct nv50_program * prog,uint16_t chipset,struct pipe_debug_callback * debug)325 nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
326                        struct pipe_debug_callback *debug)
327 {
328    struct nv50_ir_prog_info *info;
329    struct nv50_ir_prog_info_out info_out = {};
330    int i, ret;
331    const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80;
332 
333    info = CALLOC_STRUCT(nv50_ir_prog_info);
334    if (!info)
335       return false;
336 
337    info->type = prog->type;
338    info->target = chipset;
339 
340    info->bin.sourceRep = prog->pipe.type;
341    switch (prog->pipe.type) {
342    case PIPE_SHADER_IR_TGSI:
343       info->bin.source = (void *)prog->pipe.tokens;
344       break;
345    case PIPE_SHADER_IR_NIR:
346       info->bin.source = (void *)nir_shader_clone(NULL, prog->pipe.ir.nir);
347       break;
348    default:
349       assert(!"unsupported IR!");
350       free(info);
351       return false;
352    }
353 
354    info->bin.smemSize = prog->cp.smem_size;
355    info->io.auxCBSlot = 15;
356    info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
357    info->io.genUserClip = prog->vp.clpd_nr;
358    if (prog->fp.alphatest)
359       info->io.alphaRefBase = NV50_CB_AUX_ALPHATEST_OFFSET;
360 
361    info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
362    info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET;
363    info->io.msInfoCBSlot = 15;
364    info->io.msInfoBase = NV50_CB_AUX_MS_OFFSET;
365 
366    info->assignSlots = nv50_program_assign_varying_slots;
367 
368    prog->vp.bfc[0] = 0xff;
369    prog->vp.bfc[1] = 0xff;
370    prog->vp.edgeflag = 0xff;
371    prog->vp.clpd[0] = map_undef;
372    prog->vp.clpd[1] = map_undef;
373    prog->vp.psiz = map_undef;
374    prog->gp.has_layer = 0;
375    prog->gp.has_viewport = 0;
376 
377    if (prog->type == PIPE_SHADER_COMPUTE)
378       info->prop.cp.inputOffset = 0x10;
379 
380    info_out.driverPriv = prog;
381 
382 #ifndef NDEBUG
383    info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
384    info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
385    info->omitLineNum = debug_get_num_option("NV50_PROG_DEBUG_OMIT_LINENUM", 0);
386 #else
387    info->optLevel = 3;
388 #endif
389 
390    ret = nv50_ir_generate_code(info, &info_out);
391    if (ret) {
392       NOUVEAU_ERR("shader translation failed: %i\n", ret);
393       goto out;
394    }
395 
396    prog->code = info_out.bin.code;
397    prog->code_size = info_out.bin.codeSize;
398    prog->fixups = info_out.bin.relocData;
399    prog->interps = info_out.bin.fixupData;
400    prog->max_gpr = MAX2(4, (info_out.bin.maxGPR >> 1) + 1);
401    prog->tls_space = info_out.bin.tlsSpace;
402    prog->cp.smem_size = info_out.bin.smemSize;
403    prog->mul_zero_wins = info->io.mul_zero_wins;
404    prog->vp.need_vertex_id = info_out.io.vertexId < PIPE_MAX_SHADER_INPUTS;
405 
406    prog->vp.clip_enable = (1 << info_out.io.clipDistances) - 1;
407    prog->vp.cull_enable =
408       ((1 << info_out.io.cullDistances) - 1) << info_out.io.clipDistances;
409    prog->vp.clip_mode = 0;
410    for (i = 0; i < info_out.io.cullDistances; ++i)
411       prog->vp.clip_mode |= 1 << ((info_out.io.clipDistances + i) * 4);
412 
413    if (prog->type == PIPE_SHADER_FRAGMENT) {
414       if (info_out.prop.fp.writesDepth) {
415          prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
416          prog->fp.flags[1] = 0x11;
417       }
418       if (info_out.prop.fp.usesDiscard)
419          prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
420    } else
421    if (prog->type == PIPE_SHADER_GEOMETRY) {
422       switch (info_out.prop.gp.outputPrim) {
423       case PIPE_PRIM_LINE_STRIP:
424          prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;
425          break;
426       case PIPE_PRIM_TRIANGLE_STRIP:
427          prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;
428          break;
429       case PIPE_PRIM_POINTS:
430       default:
431          assert(info_out.prop.gp.outputPrim == PIPE_PRIM_POINTS);
432          prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;
433          break;
434       }
435       prog->gp.vert_count = CLAMP(info_out.prop.gp.maxVertices, 1, 1024);
436    }
437 
438    if (prog->pipe.stream_output.num_outputs)
439       prog->so = nv50_program_create_strmout_state(&info_out,
440                                                    &prog->pipe.stream_output);
441 
442    pipe_debug_message(debug, SHADER_INFO,
443                       "type: %d, local: %d, shared: %d, gpr: %d, inst: %d, bytes: %d",
444                       prog->type, info_out.bin.tlsSpace, info_out.bin.smemSize,
445                       prog->max_gpr, info_out.bin.instructions,
446                       info_out.bin.codeSize);
447 
448 out:
449    if (info->bin.sourceRep == PIPE_SHADER_IR_NIR)
450       ralloc_free((void *)info->bin.source);
451    FREE(info);
452    return !ret;
453 }
454 
455 bool
nv50_program_upload_code(struct nv50_context * nv50,struct nv50_program * prog)456 nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
457 {
458    struct nouveau_heap *heap;
459    int ret;
460    uint32_t size = align(prog->code_size, 0x40);
461    uint8_t prog_type;
462 
463    switch (prog->type) {
464    case PIPE_SHADER_VERTEX:   heap = nv50->screen->vp_code_heap; break;
465    case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
466    case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
467    case PIPE_SHADER_COMPUTE:  heap = nv50->screen->fp_code_heap; break;
468    default:
469       assert(!"invalid program type");
470       return false;
471    }
472 
473    ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
474    if (ret) {
475       /* Out of space: evict everything to compactify the code segment, hoping
476        * the working set is much smaller and drifts slowly. Improve me !
477        */
478       while (heap->next) {
479          struct nv50_program *evict = heap->next->priv;
480          if (evict)
481             nouveau_heap_free(&evict->mem);
482       }
483       debug_printf("WARNING: out of code space, evicting all shaders.\n");
484       ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
485       if (ret) {
486          NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
487          return false;
488       }
489    }
490 
491    if (prog->type == PIPE_SHADER_COMPUTE) {
492       /* CP code must be uploaded in FP code segment. */
493       prog_type = 1;
494    } else {
495       prog->code_base = prog->mem->start;
496       prog_type = prog->type;
497    }
498 
499    ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
500    if (ret < 0) {
501       nouveau_heap_free(&prog->mem);
502       return false;
503    }
504    if (ret > 0)
505       nv50->state.new_tls_space = true;
506 
507    if (prog->fixups)
508       nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
509    if (prog->interps)
510       nv50_ir_apply_fixups(prog->interps, prog->code,
511                            prog->fp.force_persample_interp,
512                            false /* flatshade */,
513                            prog->fp.alphatest - 1);
514 
515    nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
516                        (prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
517                        NOUVEAU_BO_VRAM, prog->code_size, prog->code);
518 
519    BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
520    PUSH_DATA (nv50->base.pushbuf, 0);
521 
522    return true;
523 }
524 
525 void
nv50_program_destroy(struct nv50_context * nv50,struct nv50_program * p)526 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
527 {
528    const struct pipe_shader_state pipe = p->pipe;
529    const ubyte type = p->type;
530 
531    if (p->mem)
532       nouveau_heap_free(&p->mem);
533 
534    FREE(p->code);
535 
536    FREE(p->fixups);
537    FREE(p->interps);
538    FREE(p->so);
539 
540    memset(p, 0, sizeof(*p));
541 
542    p->pipe = pipe;
543    p->type = type;
544 }
545