1 /*
2  * Copyright 2010 Christoph Bumiller
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "pipe/p_defines.h"
24 
25 #include "nv50/nv50_program.h"
26 #include "nv50/nv50_context.h"
27 
28 #include "codegen/nv50_ir_driver.h"
29 
/* Count the bits set in the low four bits of val; higher bits are ignored. */
static inline unsigned
bitcount4(const uint32_t val)
{
   unsigned bits = 0;
   unsigned c;

   for (c = 0; c < 4; ++c)
      bits += (val >> c) & 1;

   return bits;
}
37 
38 static int
nv50_vertprog_assign_slots(struct nv50_ir_prog_info * info)39 nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
40 {
41    struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
42    unsigned i, n, c;
43 
44    n = 0;
45    for (i = 0; i < info->numInputs; ++i) {
46       prog->in[i].id = i;
47       prog->in[i].sn = info->in[i].sn;
48       prog->in[i].si = info->in[i].si;
49       prog->in[i].hw = n;
50       prog->in[i].mask = info->in[i].mask;
51 
52       prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32);
53 
54       for (c = 0; c < 4; ++c)
55          if (info->in[i].mask & (1 << c))
56             info->in[i].slot[c] = n++;
57 
58       if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
59          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
60    }
61    prog->in_nr = info->numInputs;
62 
63    for (i = 0; i < info->numSysVals; ++i) {
64       switch (info->sv[i].sn) {
65       case TGSI_SEMANTIC_INSTANCEID:
66          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;
67          continue;
68       case TGSI_SEMANTIC_VERTEXID:
69          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
70          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID_DRAW_ARRAYS_ADD_START;
71          continue;
72       default:
73          break;
74       }
75    }
76 
77    /*
78     * Corner case: VP has no inputs, but we will still need to submit data to
79     * draw it. HW will shout at us and won't draw anything if we don't enable
80     * any input, so let's just pretend it's the first one.
81     */
82    if (prog->vp.attrs[0] == 0 &&
83        prog->vp.attrs[1] == 0 &&
84        prog->vp.attrs[2] == 0)
85       prog->vp.attrs[0] |= 0xf;
86 
87    /* VertexID before InstanceID */
88    if (info->io.vertexId < info->numSysVals)
89       info->sv[info->io.vertexId].slot[0] = n++;
90    if (info->io.instanceId < info->numSysVals)
91       info->sv[info->io.instanceId].slot[0] = n++;
92 
93    n = 0;
94    for (i = 0; i < info->numOutputs; ++i) {
95       switch (info->out[i].sn) {
96       case TGSI_SEMANTIC_PSIZE:
97          prog->vp.psiz = i;
98          break;
99       case TGSI_SEMANTIC_CLIPDIST:
100          prog->vp.clpd[info->out[i].si] = n;
101          break;
102       case TGSI_SEMANTIC_EDGEFLAG:
103          prog->vp.edgeflag = i;
104          break;
105       case TGSI_SEMANTIC_BCOLOR:
106          prog->vp.bfc[info->out[i].si] = i;
107          break;
108       case TGSI_SEMANTIC_LAYER:
109          prog->gp.has_layer = true;
110          prog->gp.layerid = n;
111          break;
112       case TGSI_SEMANTIC_VIEWPORT_INDEX:
113          prog->gp.has_viewport = true;
114          prog->gp.viewportid = n;
115          break;
116       default:
117          break;
118       }
119       prog->out[i].id = i;
120       prog->out[i].sn = info->out[i].sn;
121       prog->out[i].si = info->out[i].si;
122       prog->out[i].hw = n;
123       prog->out[i].mask = info->out[i].mask;
124 
125       for (c = 0; c < 4; ++c)
126          if (info->out[i].mask & (1 << c))
127             info->out[i].slot[c] = n++;
128    }
129    prog->out_nr = info->numOutputs;
130    prog->max_out = n;
131    if (!prog->max_out)
132       prog->max_out = 1;
133 
134    if (prog->vp.psiz < info->numOutputs)
135       prog->vp.psiz = prog->out[prog->vp.psiz].hw;
136 
137    return 0;
138 }
139 
140 static int
nv50_fragprog_assign_slots(struct nv50_ir_prog_info * info)141 nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
142 {
143    struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
144    unsigned i, n, m, c;
145    unsigned nvary;
146    unsigned nflat;
147    unsigned nintp = 0;
148 
149    /* count recorded non-flat inputs */
150    for (m = 0, i = 0; i < info->numInputs; ++i) {
151       switch (info->in[i].sn) {
152       case TGSI_SEMANTIC_POSITION:
153          continue;
154       default:
155          m += info->in[i].flat ? 0 : 1;
156          break;
157       }
158    }
159    /* careful: id may be != i in info->in[prog->in[i].id] */
160 
161    /* Fill prog->in[] so that non-flat inputs are first and
162     * kick out special inputs that don't use the RESULT_MAP.
163     */
164    for (n = 0, i = 0; i < info->numInputs; ++i) {
165       if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {
166          prog->fp.interp |= info->in[i].mask << 24;
167          for (c = 0; c < 4; ++c)
168             if (info->in[i].mask & (1 << c))
169                info->in[i].slot[c] = nintp++;
170       } else {
171          unsigned j = info->in[i].flat ? m++ : n++;
172 
173          if (info->in[i].sn == TGSI_SEMANTIC_COLOR)
174             prog->vp.bfc[info->in[i].si] = j;
175          else if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
176             prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
177 
178          prog->in[j].id = i;
179          prog->in[j].mask = info->in[i].mask;
180          prog->in[j].sn = info->in[i].sn;
181          prog->in[j].si = info->in[i].si;
182          prog->in[j].linear = info->in[i].linear;
183 
184          prog->in_nr++;
185       }
186    }
187    if (!(prog->fp.interp & (8 << 24))) {
188       ++nintp;
189       prog->fp.interp |= 8 << 24;
190    }
191 
192    for (i = 0; i < prog->in_nr; ++i) {
193       int j = prog->in[i].id;
194 
195       prog->in[i].hw = nintp;
196       for (c = 0; c < 4; ++c)
197          if (prog->in[i].mask & (1 << c))
198             info->in[j].slot[c] = nintp++;
199    }
200    /* (n == m) if m never increased, i.e. no flat inputs */
201    nflat = (n < m) ? (nintp - prog->in[n].hw) : 0;
202    nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */
203    nvary = nintp - nflat;
204 
205    prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;
206    prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;
207 
208    /* put front/back colors right after HPOS */
209    prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;
210    for (i = 0; i < 2; ++i)
211       if (prog->vp.bfc[i] < 0xff)
212          prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;
213 
214    /* FP outputs */
215 
216    if (info->prop.fp.numColourResults > 1)
217       prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;
218 
219    for (i = 0; i < info->numOutputs; ++i) {
220       prog->out[i].id = i;
221       prog->out[i].sn = info->out[i].sn;
222       prog->out[i].si = info->out[i].si;
223       prog->out[i].mask = info->out[i].mask;
224 
225       if (i == info->io.fragDepth || i == info->io.sampleMask)
226          continue;
227       prog->out[i].hw = info->out[i].si * 4;
228 
229       for (c = 0; c < 4; ++c)
230          info->out[i].slot[c] = prog->out[i].hw + c;
231 
232       prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);
233    }
234 
235    if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS) {
236       info->out[info->io.sampleMask].slot[0] = prog->max_out++;
237       prog->fp.has_samplemask = 1;
238    }
239 
240    if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
241       info->out[info->io.fragDepth].slot[2] = prog->max_out++;
242 
243    if (!prog->max_out)
244       prog->max_out = 4;
245 
246    return 0;
247 }
248 
249 static int
nv50_program_assign_varying_slots(struct nv50_ir_prog_info * info)250 nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
251 {
252    switch (info->type) {
253    case PIPE_SHADER_VERTEX:
254       return nv50_vertprog_assign_slots(info);
255    case PIPE_SHADER_GEOMETRY:
256       return nv50_vertprog_assign_slots(info);
257    case PIPE_SHADER_FRAGMENT:
258       return nv50_fragprog_assign_slots(info);
259    case PIPE_SHADER_COMPUTE:
260       return 0;
261    default:
262       return -1;
263    }
264 }
265 
266 static struct nv50_stream_output_state *
nv50_program_create_strmout_state(const struct nv50_ir_prog_info * info,const struct pipe_stream_output_info * pso)267 nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
268                                   const struct pipe_stream_output_info *pso)
269 {
270    struct nv50_stream_output_state *so;
271    unsigned b, i, c;
272    unsigned base[4];
273 
274    so = MALLOC_STRUCT(nv50_stream_output_state);
275    if (!so)
276       return NULL;
277    memset(so->map, 0xff, sizeof(so->map));
278 
279    for (b = 0; b < 4; ++b)
280       so->num_attribs[b] = 0;
281    for (i = 0; i < pso->num_outputs; ++i) {
282       unsigned end =  pso->output[i].dst_offset + pso->output[i].num_components;
283       b = pso->output[i].output_buffer;
284       assert(b < 4);
285       so->num_attribs[b] = MAX2(so->num_attribs[b], end);
286    }
287 
288    so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;
289 
290    so->stride[0] = pso->stride[0] * 4;
291    base[0] = 0;
292    for (b = 1; b < 4; ++b) {
293       assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
294       so->stride[b] = so->num_attribs[b] * 4;
295       if (so->num_attribs[b])
296          so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
297       base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
298    }
299    if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
300       assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
301       so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
302    }
303 
304    so->map_size = base[3] + so->num_attribs[3];
305 
306    for (i = 0; i < pso->num_outputs; ++i) {
307       const unsigned s = pso->output[i].start_component;
308       const unsigned p = pso->output[i].dst_offset;
309       const unsigned r = pso->output[i].register_index;
310       b = pso->output[i].output_buffer;
311 
312       if (r >= info->numOutputs)
313          continue;
314 
315       for (c = 0; c < pso->output[i].num_components; ++c)
316          so->map[base[b] + p + c] = info->out[r].slot[s + c];
317    }
318 
319    return so;
320 }
321 
322 bool
nv50_program_translate(struct nv50_program * prog,uint16_t chipset,struct pipe_debug_callback * debug)323 nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
324                        struct pipe_debug_callback *debug)
325 {
326    struct nv50_ir_prog_info *info;
327    int i, ret;
328    const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80;
329 
330    info = CALLOC_STRUCT(nv50_ir_prog_info);
331    if (!info)
332       return false;
333 
334    info->type = prog->type;
335    info->target = chipset;
336    info->bin.sourceRep = PIPE_SHADER_IR_TGSI;
337    info->bin.source = (void *)prog->pipe.tokens;
338 
339    info->bin.smemSize = prog->cp.smem_size;
340    info->io.auxCBSlot = 15;
341    info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
342    info->io.genUserClip = prog->vp.clpd_nr;
343    if (prog->fp.alphatest)
344       info->io.alphaRefBase = NV50_CB_AUX_ALPHATEST_OFFSET;
345 
346    info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
347    info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET;
348    info->io.msInfoCBSlot = 15;
349    info->io.msInfoBase = NV50_CB_AUX_MS_OFFSET;
350 
351    info->assignSlots = nv50_program_assign_varying_slots;
352 
353    prog->vp.bfc[0] = 0xff;
354    prog->vp.bfc[1] = 0xff;
355    prog->vp.edgeflag = 0xff;
356    prog->vp.clpd[0] = map_undef;
357    prog->vp.clpd[1] = map_undef;
358    prog->vp.psiz = map_undef;
359    prog->gp.has_layer = 0;
360    prog->gp.has_viewport = 0;
361 
362    if (prog->type == PIPE_SHADER_COMPUTE)
363       info->prop.cp.inputOffset = 0x10;
364 
365    info->driverPriv = prog;
366 
367 #ifdef DEBUG
368    info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
369    info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
370    info->omitLineNum = debug_get_num_option("NV50_PROG_DEBUG_OMIT_LINENUM", 0);
371 #else
372    info->optLevel = 3;
373 #endif
374 
375    ret = nv50_ir_generate_code(info);
376    if (ret) {
377       NOUVEAU_ERR("shader translation failed: %i\n", ret);
378       goto out;
379    }
380 
381    prog->code = info->bin.code;
382    prog->code_size = info->bin.codeSize;
383    prog->fixups = info->bin.relocData;
384    prog->interps = info->bin.fixupData;
385    prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
386    prog->tls_space = info->bin.tlsSpace;
387    prog->cp.smem_size = info->bin.smemSize;
388    prog->mul_zero_wins = info->io.mul_zero_wins;
389    prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
390 
391    prog->vp.clip_enable = (1 << info->io.clipDistances) - 1;
392    prog->vp.cull_enable =
393       ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;
394    prog->vp.clip_mode = 0;
395    for (i = 0; i < info->io.cullDistances; ++i)
396       prog->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);
397 
398    if (prog->type == PIPE_SHADER_FRAGMENT) {
399       if (info->prop.fp.writesDepth) {
400          prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
401          prog->fp.flags[1] = 0x11;
402       }
403       if (info->prop.fp.usesDiscard)
404          prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
405    } else
406    if (prog->type == PIPE_SHADER_GEOMETRY) {
407       switch (info->prop.gp.outputPrim) {
408       case PIPE_PRIM_LINE_STRIP:
409          prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;
410          break;
411       case PIPE_PRIM_TRIANGLE_STRIP:
412          prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;
413          break;
414       case PIPE_PRIM_POINTS:
415       default:
416          assert(info->prop.gp.outputPrim == PIPE_PRIM_POINTS);
417          prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;
418          break;
419       }
420       prog->gp.vert_count = CLAMP(info->prop.gp.maxVertices, 1, 1024);
421    }
422 
423    if (prog->type == PIPE_SHADER_COMPUTE) {
424       prog->cp.syms = info->bin.syms;
425       prog->cp.num_syms = info->bin.numSyms;
426    } else {
427       FREE(info->bin.syms);
428    }
429 
430    if (prog->pipe.stream_output.num_outputs)
431       prog->so = nv50_program_create_strmout_state(info,
432                                                    &prog->pipe.stream_output);
433 
434    pipe_debug_message(debug, SHADER_INFO,
435                       "type: %d, local: %d, shared: %d, gpr: %d, inst: %d, bytes: %d",
436                       prog->type, info->bin.tlsSpace, info->bin.smemSize,
437                       prog->max_gpr, info->bin.instructions,
438                       info->bin.codeSize);
439 
440 out:
441    FREE(info);
442    return !ret;
443 }
444 
445 bool
nv50_program_upload_code(struct nv50_context * nv50,struct nv50_program * prog)446 nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
447 {
448    struct nouveau_heap *heap;
449    int ret;
450    uint32_t size = align(prog->code_size, 0x40);
451    uint8_t prog_type;
452 
453    switch (prog->type) {
454    case PIPE_SHADER_VERTEX:   heap = nv50->screen->vp_code_heap; break;
455    case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
456    case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
457    case PIPE_SHADER_COMPUTE:  heap = nv50->screen->fp_code_heap; break;
458    default:
459       assert(!"invalid program type");
460       return false;
461    }
462 
463    ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
464    if (ret) {
465       /* Out of space: evict everything to compactify the code segment, hoping
466        * the working set is much smaller and drifts slowly. Improve me !
467        */
468       while (heap->next) {
469          struct nv50_program *evict = heap->next->priv;
470          if (evict)
471             nouveau_heap_free(&evict->mem);
472       }
473       debug_printf("WARNING: out of code space, evicting all shaders.\n");
474       ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
475       if (ret) {
476          NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
477          return false;
478       }
479    }
480 
481    if (prog->type == PIPE_SHADER_COMPUTE) {
482       /* CP code must be uploaded in FP code segment. */
483       prog_type = 1;
484    } else {
485       prog->code_base = prog->mem->start;
486       prog_type = prog->type;
487    }
488 
489    ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
490    if (ret < 0) {
491       nouveau_heap_free(&prog->mem);
492       return false;
493    }
494    if (ret > 0)
495       nv50->state.new_tls_space = true;
496 
497    if (prog->fixups)
498       nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
499    if (prog->interps)
500       nv50_ir_apply_fixups(prog->interps, prog->code,
501                            prog->fp.force_persample_interp,
502                            false /* flatshade */,
503                            prog->fp.alphatest - 1);
504 
505    nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
506                        (prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
507                        NOUVEAU_BO_VRAM, prog->code_size, prog->code);
508 
509    BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
510    PUSH_DATA (nv50->base.pushbuf, 0);
511 
512    return true;
513 }
514 
515 void
nv50_program_destroy(struct nv50_context * nv50,struct nv50_program * p)516 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
517 {
518    const struct pipe_shader_state pipe = p->pipe;
519    const ubyte type = p->type;
520 
521    if (p->mem)
522       nouveau_heap_free(&p->mem);
523 
524    FREE(p->code);
525 
526    FREE(p->fixups);
527    FREE(p->interps);
528    FREE(p->so);
529 
530    if (type == PIPE_SHADER_COMPUTE)
531       FREE(p->cp.syms);
532 
533    memset(p, 0, sizeof(*p));
534 
535    p->pipe = pipe;
536    p->type = type;
537 }
538