1 /*
2  * Copyright © 2014-2015 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "compiler/nir/nir.h"
25 #include "compiler/nir/nir_deref.h"
26 #include "nir/nir_to_tgsi.h"
27 #include "pipe/p_screen.h"
28 #include "pipe/p_state.h"
29 #include "tgsi/tgsi_dump.h"
30 #include "tgsi/tgsi_from_mesa.h"
31 #include "tgsi/tgsi_info.h"
32 #include "tgsi/tgsi_ureg.h"
33 #include "util/debug.h"
34 
35 struct ntt_compile {
36    nir_shader *s;
37    nir_function_impl *impl;
38    struct pipe_screen *screen;
39    struct ureg_program *ureg;
40 
41    bool needs_texcoord_semantic;
42    bool any_reg_as_address;
43    bool native_integers;
44 
45    int next_addr_reg;
46    bool addr_declared[2];
47    struct ureg_dst addr_reg[2];
48 
49    unsigned loop_label;
50 
51    /* if condition set up at the end of a block, for ntt_emit_if(). */
52    struct ureg_src if_cond;
53 
54    /* TGSI temps for our NIR SSA and register values. */
55    struct ureg_dst *reg_temp;
56    struct ureg_dst *ssa_temp;
57 
58    nir_instr_liveness *liveness;
59 
60    /* Mappings from driver_location to TGSI input/output number.
61     *
62     * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
63     * their numbers assigned incrementally, unlike inputs or constants.
64     */
65    struct ureg_src *input_index_map;
66    uint64_t centroid_inputs;
67 
68    struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
69 };
70 
71 static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);
72 
/* Widens a write mask over (up to two) 64-bit channels into the mask over
 * the 32-bit channel pairs they occupy: bit 0 selects .xy (0x3) and bit 1
 * selects .zw (0xc).  Any higher bits of write_mask are ignored.
 */
static unsigned
ntt_64bit_write_mask(unsigned write_mask)
{
   unsigned tgsi_mask = 0;

   if (write_mask & 1)
      tgsi_mask |= 0x3;
   if (write_mask & 2)
      tgsi_mask |= 0xc;

   return tgsi_mask;
}
78 
79 static struct ureg_src
ntt_64bit_1f(struct ntt_compile * c)80 ntt_64bit_1f(struct ntt_compile *c)
81 {
82    return ureg_imm4u(c->ureg,
83                      0x00000000, 0x3ff00000,
84                      0x00000000, 0x3ff00000);
85 }
86 
87 static const struct glsl_type *
ntt_shader_input_type(struct ntt_compile * c,struct nir_variable * var)88 ntt_shader_input_type(struct ntt_compile *c,
89                       struct nir_variable *var)
90 {
91    switch (c->s->info.stage) {
92    case MESA_SHADER_GEOMETRY:
93    case MESA_SHADER_TESS_EVAL:
94    case MESA_SHADER_TESS_CTRL:
95       if (glsl_type_is_array(var->type))
96          return glsl_get_array_element(var->type);
97       else
98          return var->type;
99    default:
100       return var->type;
101    }
102 }
103 
104 static void
ntt_get_gl_varying_semantic(struct ntt_compile * c,unsigned location,unsigned * semantic_name,unsigned * semantic_index)105 ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
106                             unsigned *semantic_name, unsigned *semantic_index)
107 {
108    /* We want to use most of tgsi_get_gl_varying_semantic(), but the
109     * !texcoord shifting has already been applied, so avoid that.
110     */
111    if (!c->needs_texcoord_semantic &&
112        (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
113       *semantic_name = TGSI_SEMANTIC_GENERIC;
114       *semantic_index = location - VARYING_SLOT_VAR0;
115       return;
116    }
117 
118    tgsi_get_gl_varying_semantic(location, true,
119                                 semantic_name, semantic_index);
120 }
121 
122 /* TGSI varying declarations have a component usage mask associated (used by
123  * r600 and svga).
124  */
125 static uint32_t
ntt_tgsi_usage_mask(unsigned start_component,unsigned num_components,bool is_64)126 ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
127                     bool is_64)
128 {
129    uint32_t usage_mask =
130       u_bit_consecutive(start_component, num_components);
131 
132    if (is_64) {
133       if (start_component >= 2)
134          usage_mask >>= 2;
135 
136       uint32_t tgsi_usage_mask = 0;
137 
138       if (usage_mask & TGSI_WRITEMASK_X)
139          tgsi_usage_mask |= TGSI_WRITEMASK_XY;
140       if (usage_mask & TGSI_WRITEMASK_Y)
141          tgsi_usage_mask |= TGSI_WRITEMASK_ZW;
142 
143       return tgsi_usage_mask;
144    } else {
145       return usage_mask;
146    }
147 }
148 
149 /* TGSI varying declarations have a component usage mask associated (used by
150  * r600 and svga).
151  */
152 static uint32_t
ntt_tgsi_var_usage_mask(const struct nir_variable * var)153 ntt_tgsi_var_usage_mask(const struct nir_variable *var)
154 {
155    const struct glsl_type *type_without_array =
156       glsl_without_array(var->type);
157    unsigned num_components = glsl_get_vector_elements(type_without_array);
158    if (num_components == 0) /* structs */
159       num_components = 4;
160 
161    return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
162                               glsl_type_is_64bit(type_without_array));
163 }
164 
165 static void
ntt_setup_inputs(struct ntt_compile * c)166 ntt_setup_inputs(struct ntt_compile *c)
167 {
168    if (c->s->info.stage != MESA_SHADER_FRAGMENT)
169       return;
170 
171    unsigned num_inputs = 0;
172    int num_input_arrays = 0;
173 
174    nir_foreach_shader_in_variable(var, c->s) {
175       const struct glsl_type *type = ntt_shader_input_type(c, var);
176       unsigned array_len =
177          glsl_count_attribute_slots(type, false);
178 
179       num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
180    }
181 
182    c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
183 
184    nir_foreach_shader_in_variable(var, c->s) {
185       const struct glsl_type *type = ntt_shader_input_type(c, var);
186       unsigned array_len =
187          glsl_count_attribute_slots(type, false);
188 
189       unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
190       unsigned sample_loc;
191       struct ureg_src decl;
192 
193       if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
194          interpolation =
195             tgsi_get_interp_mode(var->data.interpolation,
196                                  var->data.location == VARYING_SLOT_COL0 ||
197                                  var->data.location == VARYING_SLOT_COL1);
198 
199          if (var->data.location == VARYING_SLOT_POS)
200             interpolation = TGSI_INTERPOLATE_LINEAR;
201       }
202 
203       unsigned semantic_name, semantic_index;
204       ntt_get_gl_varying_semantic(c, var->data.location,
205                                   &semantic_name, &semantic_index);
206 
207       if (var->data.sample) {
208          sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
209       } else if (var->data.centroid) {
210          sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
211          c->centroid_inputs |= (BITSET_MASK(array_len) <<
212                                 var->data.driver_location);
213       } else {
214          sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
215       }
216 
217       unsigned array_id = 0;
218       if (glsl_type_is_array(type))
219          array_id = ++num_input_arrays;
220 
221       uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);
222 
223       decl = ureg_DECL_fs_input_cyl_centroid_layout(c->ureg,
224                                                     semantic_name,
225                                                     semantic_index,
226                                                     interpolation,
227                                                     0,
228                                                     sample_loc,
229                                                     var->data.driver_location,
230                                                     usage_mask,
231                                                     array_id, array_len);
232 
233       if (semantic_name == TGSI_SEMANTIC_FACE) {
234          struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
235          /* NIR is ~0 front and 0 back, while TGSI is +1 front */
236          ureg_SGE(c->ureg, temp, decl, ureg_imm1f(c->ureg, 0));
237          decl = ureg_src(temp);
238       }
239 
240       for (unsigned i = 0; i < array_len; i++) {
241          c->input_index_map[var->data.driver_location + i] = decl;
242          c->input_index_map[var->data.driver_location + i].Index += i;
243       }
244    }
245 }
246 
247 static void
ntt_setup_uniforms(struct ntt_compile * c)248 ntt_setup_uniforms(struct ntt_compile *c)
249 {
250    struct pipe_screen *screen = c->screen;
251    bool packed = screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS);
252 
253    nir_foreach_uniform_variable(var, c->s) {
254       if (glsl_type_is_image(var->type)) {
255          c->images[var->data.binding] = ureg_DECL_image(c->ureg,
256                                                         var->data.binding,
257                                                         TGSI_TEXTURE_2D,
258                                                         var->data.image.format,
259                                                         !var->data.read_only,
260                                                         false);
261       } else {
262          unsigned size;
263          if (packed) {
264             size = DIV_ROUND_UP(glsl_count_dword_slots(var->type,
265                                                        var->data.bindless), 4);
266          } else {
267             size = glsl_count_vec4_slots(var->type, false, var->data.bindless);
268          }
269 
270          for (unsigned i = 0; i < size; i++)
271             ureg_DECL_constant(c->ureg, var->data.driver_location + i);
272       }
273    }
274 
275    nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
276       ureg_DECL_constant2D(c->ureg, 0, 0, var->data.driver_location + 1);
277    }
278 
279    for (int i = 0; i < PIPE_MAX_SAMPLERS; i++) {
280       if (c->s->info.textures_used & (1 << i))
281          ureg_DECL_sampler(c->ureg, i);
282    }
283 }
284 
285 static void
ntt_setup_registers(struct ntt_compile * c,struct exec_list * list)286 ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
287 {
288    foreach_list_typed(nir_register, nir_reg, node, list) {
289       struct ureg_dst decl;
290       if (nir_reg->num_array_elems == 0) {
291          uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);
292          if (nir_reg->bit_size == 64) {
293             if (nir_reg->num_components > 2) {
294                fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
295                        nir_reg->num_components, nir_reg->index);
296             }
297 
298             write_mask = ntt_64bit_write_mask(write_mask);
299          }
300 
301          decl = ureg_writemask(ureg_DECL_temporary(c->ureg), write_mask);
302       } else {
303          decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems,
304                                           true);
305       }
306       c->reg_temp[nir_reg->index] = decl;
307    }
308 }
309 
310 static struct ureg_src
ntt_get_load_const_src(struct ntt_compile * c,nir_load_const_instr * instr)311 ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
312 {
313    uint32_t values[4];
314    int num_components = instr->def.num_components;
315 
316    if (instr->def.bit_size == 32) {
317       for (int i = 0; i < num_components; i++)
318          values[i] = instr->value[i].u32;
319    } else {
320       assert(num_components <= 2);
321       for (int i = 0; i < num_components; i++) {
322          values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
323          values[i * 2 + 1] = instr->value[i].u64 >> 32;
324       }
325       num_components *= 2;
326    }
327 
328    return ureg_DECL_immediate_uint(c->ureg, values, num_components);
329 }
330 
331 static struct ureg_src
ntt_reladdr(struct ntt_compile * c,struct ureg_src addr)332 ntt_reladdr(struct ntt_compile *c, struct ureg_src addr)
333 {
334    if (c->any_reg_as_address) {
335       /* Make sure we're getting the refcounting right even on any_reg
336        * drivers.
337        */
338       c->next_addr_reg++;
339 
340       return ureg_scalar(addr, 0);
341    }
342 
343    assert(c->next_addr_reg < ARRAY_SIZE(c->addr_reg));
344 
345    if (!c->addr_declared[c->next_addr_reg]) {
346       c->addr_reg[c->next_addr_reg] = ureg_writemask(ureg_DECL_address(c->ureg),
347                                                      TGSI_WRITEMASK_X);
348       c->addr_declared[c->next_addr_reg] = true;
349    }
350 
351    ureg_UARL(c->ureg, c->addr_reg[c->next_addr_reg], addr);
352    return ureg_scalar(ureg_src(c->addr_reg[c->next_addr_reg++]), 0);
353 }
354 
355 static void
ntt_put_reladdr(struct ntt_compile * c)356 ntt_put_reladdr(struct ntt_compile *c)
357 {
358    c->next_addr_reg--;
359    assert(c->next_addr_reg >= 0);
360 }
361 
362 static void
ntt_reladdr_dst_put(struct ntt_compile * c,struct ureg_dst dst)363 ntt_reladdr_dst_put(struct ntt_compile *c, struct ureg_dst dst)
364 {
365    if (c->any_reg_as_address)
366       return;
367 
368    if (dst.Indirect)
369       ntt_put_reladdr(c);
370    if (dst.DimIndirect)
371       ntt_put_reladdr(c);
372 }
373 
374 static struct ureg_src
ntt_get_src(struct ntt_compile * c,nir_src src)375 ntt_get_src(struct ntt_compile *c, nir_src src)
376 {
377    if (src.is_ssa) {
378       if (src.ssa->parent_instr->type == nir_instr_type_load_const)
379          return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));
380 
381       return ureg_src(c->ssa_temp[src.ssa->index]);
382    } else {
383       nir_register *reg = src.reg.reg;
384       struct ureg_dst reg_temp = c->reg_temp[reg->index];
385       reg_temp.Index += src.reg.base_offset;
386 
387       if (src.reg.indirect) {
388          struct ureg_src offset = ntt_get_src(c, *src.reg.indirect);
389          return ureg_src_indirect(ureg_src(reg_temp),
390                                   ntt_reladdr(c, offset));
391       } else {
392          return ureg_src(reg_temp);
393       }
394    }
395 }
396 
397 static struct ureg_src
ntt_get_alu_src(struct ntt_compile * c,nir_alu_instr * instr,int i)398 ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
399 {
400    nir_alu_src src = instr->src[i];
401    struct ureg_src usrc = ntt_get_src(c, src.src);
402 
403    if (nir_src_bit_size(src.src) == 64) {
404       int chan0 = 0, chan1 = 1;
405       if (nir_op_infos[instr->op].input_sizes[i] == 0) {
406          chan0 = ffs(instr->dest.write_mask) - 1;
407          chan1 = ffs(instr->dest.write_mask & ~(1 << chan0)) - 1;
408          if (chan1 == -1)
409             chan1 = chan0;
410       }
411       usrc = ureg_swizzle(usrc,
412                           src.swizzle[chan0] * 2,
413                           src.swizzle[chan0] * 2 + 1,
414                           src.swizzle[chan1] * 2,
415                           src.swizzle[chan1] * 2 + 1);
416    } else {
417       usrc = ureg_swizzle(usrc,
418                           src.swizzle[0],
419                           src.swizzle[1],
420                           src.swizzle[2],
421                           src.swizzle[3]);
422    }
423 
424    if (src.abs)
425       usrc = ureg_abs(usrc);
426    if (src.negate)
427       usrc = ureg_negate(usrc);
428 
429    return usrc;
430 }
431 
432 static struct ureg_dst *
ntt_get_ssa_def_decl(struct ntt_compile * c,nir_ssa_def * ssa)433 ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
434 {
435    struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
436 
437    uint32_t writemask = BITSET_MASK(ssa->num_components);
438    if (ssa->bit_size == 64)
439       writemask = ntt_64bit_write_mask(writemask);
440 
441    c->ssa_temp[ssa->index] = ureg_writemask(temp, writemask);
442 
443    return &c->ssa_temp[ssa->index];
444 }
445 
446 static struct ureg_dst *
ntt_get_dest_decl(struct ntt_compile * c,nir_dest * dest)447 ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
448 {
449    if (dest->is_ssa)
450       return ntt_get_ssa_def_decl(c, &dest->ssa);
451    else
452       return &c->reg_temp[dest->reg.reg->index];
453 }
454 
455 static struct ureg_dst
ntt_get_dest(struct ntt_compile * c,nir_dest * dest)456 ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
457 {
458    struct ureg_dst dst = *ntt_get_dest_decl(c, dest);
459 
460    if (!dest->is_ssa) {
461       dst.Index += dest->reg.base_offset;
462 
463       if (dest->reg.indirect) {
464          struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect);
465          dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset));
466       }
467    }
468 
469    return dst;
470 }
471 
472 /* For an SSA dest being populated by a constant src, replace the storage with
473  * a copy of the ureg_src.
474  */
475 static void
ntt_store_def(struct ntt_compile * c,nir_ssa_def * def,struct ureg_src src)476 ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
477 {
478    if (!src.Negate && !src.Absolute && !src.Indirect && !src.DimIndirect &&
479        src.SwizzleX == TGSI_SWIZZLE_X &&
480        (src.SwizzleY == TGSI_SWIZZLE_Y || def->num_components < 2) &&
481        (src.SwizzleZ == TGSI_SWIZZLE_Z || def->num_components < 3) &&
482        (src.SwizzleW == TGSI_SWIZZLE_W || def->num_components < 4)) {
483       switch (src.File) {
484       case TGSI_FILE_IMMEDIATE:
485       case TGSI_FILE_INPUT:
486       case TGSI_FILE_CONSTANT:
487       case TGSI_FILE_SYSTEM_VALUE:
488          c->ssa_temp[def->index] = ureg_dst(src);
489          return;
490       }
491    }
492 
493    ureg_MOV(c->ureg, *ntt_get_ssa_def_decl(c, def), src);
494 }
495 
496 static void
ntt_store(struct ntt_compile * c,nir_dest * dest,struct ureg_src src)497 ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src)
498 {
499    if (dest->is_ssa)
500       ntt_store_def(c, &dest->ssa, src);
501    else {
502       struct ureg_dst dst = ntt_get_dest(c, dest);
503       ureg_MOV(c->ureg, dst, src);
504    }
505 }
506 
507 static void
ntt_emit_scalar(struct ntt_compile * c,unsigned tgsi_op,struct ureg_dst dst,struct ureg_src src0,struct ureg_src src1)508 ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
509                 struct ureg_dst dst,
510                 struct ureg_src src0,
511                 struct ureg_src src1)
512 {
513    unsigned i;
514    int num_src;
515 
516    /* POW is the only 2-operand scalar op. */
517    if (tgsi_op  == TGSI_OPCODE_POW) {
518       num_src = 2;
519    } else {
520       num_src = 1;
521       src1 = src0;
522    }
523 
524    for (i = 0; i < 4; i++) {
525       if (dst.WriteMask & (1 << i)) {
526          struct ureg_dst this_dst = dst;
527          struct ureg_src srcs[2] = {
528             ureg_scalar(src0, i),
529             ureg_scalar(src1, i),
530          };
531          this_dst.WriteMask = (1 << i);
532 
533          ureg_insn(c->ureg, tgsi_op, &this_dst, 1, srcs, num_src, false);
534       }
535    }
536 }
537 
538 static void
ntt_emit_alu(struct ntt_compile * c,nir_alu_instr * instr)539 ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
540 {
541    struct ureg_src src[4];
542    struct ureg_dst dst;
543    unsigned i;
544    int dst_64 = nir_dest_bit_size(instr->dest.dest) == 64;
545    int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
546    int num_srcs = nir_op_infos[instr->op].num_inputs;
547 
548    assert(num_srcs <= ARRAY_SIZE(src));
549    for (i = 0; i < num_srcs; i++)
550       src[i] = ntt_get_alu_src(c, instr, i);
551    dst = ntt_get_dest(c, &instr->dest.dest);
552 
553    if (instr->dest.saturate)
554       dst.Saturate = true;
555 
556    if (dst_64)
557       dst.WriteMask = ntt_64bit_write_mask(instr->dest.write_mask);
558    else
559       dst.WriteMask = instr->dest.write_mask;
560 
561    static enum tgsi_opcode op_map[][2] = {
562       [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },
563 
564       /* fabs/fneg 32-bit are special-cased below. */
565       [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
566       [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },
567 
568       [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
569       [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
570       [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
571       [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
572       [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
573       [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
574       [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
575       [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
576       [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
577       [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },
578 
579       [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
580       [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
581       [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },
582 
583       /* The conversions will have one combination of src and dst bitsize. */
584       [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
585       [nir_op_f2f64] = { TGSI_OPCODE_F2D },
586       [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },
587 
588       [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
589       [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
590       [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
591       [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
592       [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
593       [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
594       [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
595       [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },
596 
597       [nir_op_slt] = { TGSI_OPCODE_SLT },
598       [nir_op_sge] = { TGSI_OPCODE_SGE },
599       [nir_op_seq] = { TGSI_OPCODE_SEQ },
600       [nir_op_sne] = { TGSI_OPCODE_SNE },
601 
602       [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
603       [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
604       [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
605       [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },
606 
607       [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
608       [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
609       [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
610       [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },
611 
612       [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
613       [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },
614 
615       [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
616       [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
617       [nir_op_fsign] = { TGSI_OPCODE_SSG },
618       [nir_op_isign] = { TGSI_OPCODE_ISSG },
619       [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
620       [nir_op_fddx] = { TGSI_OPCODE_DDX },
621       [nir_op_fddy] = { TGSI_OPCODE_DDY },
622       [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
623       [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
624       [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
625       [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
626       [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
627       [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
628       [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
629       [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
630       [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
631       [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
632       [nir_op_bit_count] = { TGSI_OPCODE_POPC },
633       [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
634       [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
635       [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
636       [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
637       [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
638       [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
639       [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
640       [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
641       [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
642       [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
643       [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
644       [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
645       [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
646       [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },
647 
648       /* These bitwise ops don't care about 32 vs 64 types, so they have the
649        * same TGSI op.
650        */
651       [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
652       [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
653       [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
654       [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },
655 
656       [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
657       [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
658       [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
659       [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
660       [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
661       [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
662       [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
663       [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
664    };
665 
666    /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
667     * of .xy.  Store to a temp and move it to the real dst.
668     */
669    bool tgsi_64bit_compare = src_64 && !dst_64 &&
670       (num_srcs == 2 ||
671         nir_op_infos[instr->op].output_type == nir_type_bool32) &&
672       (dst.WriteMask != TGSI_WRITEMASK_X);
673 
674    /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
675     * channels and will need to get fixed up.
676     */
677    bool tgsi_64bit_downconvert = (src_64 && !dst_64 &&
678                                   num_srcs == 1 && !tgsi_64bit_compare &&
679                                   (dst.WriteMask & ~TGSI_WRITEMASK_XY));
680 
681    struct ureg_dst real_dst = ureg_dst_undef();
682    if (tgsi_64bit_compare || tgsi_64bit_downconvert) {
683       real_dst = dst;
684       dst = ureg_DECL_temporary(c->ureg);
685    }
686 
687    bool table_op64 = src_64;
688    if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
689       /* The normal path for NIR to TGSI ALU op translation */
690       ureg_insn(c->ureg, op_map[instr->op][table_op64],
691                 &dst, 1, src, num_srcs, false);
692    } else {
693       /* Special cases for NIR to TGSI ALU op translation. */
694 
695       /* TODO: Use something like the ntt_store() path for the MOV calls so we
696        * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
697        */
698 
699       switch (instr->op) {
700       case nir_op_u2u64:
701          ureg_AND(c->ureg, dst, ureg_swizzle(src[0],
702                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
703                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
704                   ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
705          break;
706 
707       case nir_op_i2i32:
708       case nir_op_u2u32:
709          assert(src_64);
710          ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
711                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
712                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
713          break;
714 
715       case nir_op_fabs:
716          ureg_MOV(c->ureg, dst, ureg_abs(src[0]));
717          break;
718 
719       case nir_op_fsat:
720          if (dst_64) {
721             ureg_MIN(c->ureg, dst, src[0], ntt_64bit_1f(c));
722             ureg_MAX(c->ureg, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
723          } else {
724             ureg_MOV(c->ureg, ureg_saturate(dst), src[0]);
725          }
726          break;
727 
728       case nir_op_fneg:
729          ureg_MOV(c->ureg, dst, ureg_negate(src[0]));
730          break;
731 
732          /* NOTE: TGSI 32-bit math ops have the old "one source channel
733           * replicated to all dst channels" behavior, while 64 is normal mapping
734           * of src channels to dst.
735           */
736       case nir_op_frcp:
737          assert(!dst_64);
738          ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], src[1]);
739          break;
740 
741       case nir_op_frsq:
742          assert(!dst_64);
743          ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], src[1]);
744          break;
745 
746       case nir_op_fsqrt:
747          assert(!dst_64);
748          ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], src[1]);
749          break;
750 
751       case nir_op_fexp2:
752          assert(!dst_64);
753          ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], src[1]);
754          break;
755 
756       case nir_op_flog2:
757          assert(!dst_64);
758          ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], src[1]);
759          break;
760 
761       case nir_op_b2f32:
762          ureg_AND(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 1.0));
763          break;
764 
765       case nir_op_b2f64:
766          ureg_AND(c->ureg, dst,
767                   ureg_swizzle(src[0],
768                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
769                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
770                   ntt_64bit_1f(c));
771          break;
772 
773       case nir_op_f2b32:
774          if (src_64)
775             ureg_DSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
776          else
777             ureg_FSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
778          break;
779 
780       case nir_op_i2b32:
781          if (src_64) {
782             ureg_U64SNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
783          } else
784             ureg_USNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
785          break;
786 
787       case nir_op_b2i32:
788          ureg_AND(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 1));
789          break;
790 
791       case nir_op_b2i64:
792          ureg_AND(c->ureg, dst,
793                   ureg_swizzle(src[0],
794                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
795                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
796                   ureg_imm4u(c->ureg, 1, 0, 1, 0));
797          break;
798 
799       case nir_op_fsin:
800          ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], src[1]);
801          break;
802 
803       case nir_op_fcos:
804          ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], src[1]);
805          break;
806 
807       case nir_op_fsub:
808          assert(!dst_64);
809          ureg_ADD(c->ureg, dst, src[0], ureg_negate(src[1]));
810          break;
811 
812       case nir_op_isub:
813          assert(!dst_64);
814          ureg_UADD(c->ureg, dst, src[0], ureg_negate(src[1]));
815          break;
816 
817          /* XXX: carry */
818 
819       case nir_op_fmod:
820          unreachable("should be handled by .lower_fmod = true");
821          break;
822 
823       case nir_op_fpow:
824          ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
825          break;
826 
827       case nir_op_flrp:
828          ureg_LRP(c->ureg, dst, src[2], src[1], src[0]);
829          break;
830 
831       case nir_op_pack_64_2x32_split:
832          ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
833                   ureg_swizzle(src[0],
834                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
835                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
836          ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_YW),
837                   ureg_swizzle(src[1],
838                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
839                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
840          break;
841 
842       case nir_op_unpack_64_2x32_split_x:
843          ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
844                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
845                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
846          break;
847 
848       case nir_op_unpack_64_2x32_split_y:
849          ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
850                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
851                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
852          break;
853 
854       case nir_op_b32csel:
855          if (nir_src_bit_size(instr->src[1].src) == 64) {
856             ureg_UCMP(c->ureg, dst, ureg_swizzle(src[0],
857                                                  TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
858                                                  TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
859                       src[1], src[2]);
860          } else {
861             ureg_UCMP(c->ureg, dst, src[0], src[1], src[2]);
862          }
863          break;
864 
865       case nir_op_fcsel:
866          /* NIR is src0 != 0 ? src1 : src2.
867           * TGSI is src0 < 0 ? src1 : src2.
868           *
869           * However, fcsel so far as I can find only appears on
870           * bools-as-floats (1.0 or 0.0), so we can negate it for the TGSI op.
871           */
872          ureg_CMP(c->ureg, dst, ureg_negate(src[0]), src[2], src[1]);
873          break;
874 
875          /* It would be nice if we could get this left as scalar in NIR, since
876           * the TGSI op is scalar.
877           */
878       case nir_op_frexp_sig:
879       case nir_op_frexp_exp: {
880          assert(src_64);
881          struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
882 
883          for (int chan = 0; chan < 2; chan++) {
884             int wm = 1 << chan;
885 
886             if (!(instr->dest.write_mask & wm))
887                continue;
888 
889             struct ureg_dst dsts[2] = { temp, temp };
890             if (instr->op == nir_op_frexp_sig) {
891                dsts[0] = ureg_writemask(dst, ntt_64bit_write_mask(wm));
892             } else {
893                dsts[1] = ureg_writemask(dst, wm);
894             }
895 
896             struct ureg_src chan_src = ureg_swizzle(src[0],
897                                                     chan * 2, chan * 2 + 1,
898                                                     chan * 2, chan * 2 + 1);
899 
900             ureg_insn(c->ureg, TGSI_OPCODE_DFRACEXP,
901                       dsts, 2,
902                       &chan_src, 1, false);
903          }
904 
905          ureg_release_temporary(c->ureg, temp);
906          break;
907       }
908 
909       case nir_op_ldexp:
910          assert(dst_64); /* 32bit handled in table. */
911          ureg_DLDEXP(c->ureg, dst, src[0],
912                      ureg_swizzle(src[1],
913                                   TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
914                                   TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
915          break;
916 
917       case nir_op_vec4:
918       case nir_op_vec3:
919       case nir_op_vec2:
920          unreachable("covered by nir_lower_vec_to_movs()");
921 
922       default:
923          fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
924          unreachable("Unknown NIR opcode");
925       }
926    }
927 
928    /* 64-bit op fixup movs */
929    if (!ureg_dst_is_undef(real_dst)) {
930       if (tgsi_64bit_compare) {
931          ureg_MOV(c->ureg, real_dst,
932                   ureg_swizzle(ureg_src(dst), 0, 2, 0, 2));
933       } else {
934          assert(tgsi_64bit_downconvert);
935          uint8_t swizzle[] = {0, 0, 0, 0};
936          uint32_t second_bit = real_dst.WriteMask & ~(1 << (ffs(real_dst.WriteMask) - 1));
937          if (second_bit)
938             swizzle[ffs(second_bit) - 1] = 1;
939          ureg_MOV(c->ureg, real_dst, ureg_swizzle(ureg_src(dst),
940                                                   swizzle[0],
941                                                   swizzle[1],
942                                                   swizzle[2],
943                                                   swizzle[3]));
944       }
945       ureg_release_temporary(c->ureg, dst);
946    }
947 }
948 
949 static struct ureg_src
ntt_ureg_src_indirect(struct ntt_compile * c,struct ureg_src usrc,nir_src src)950 ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
951                       nir_src src)
952 {
953    if (nir_src_is_const(src)) {
954       usrc.Index += nir_src_as_uint(src);
955       return usrc;
956    } else {
957       return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src)));
958    }
959 }
960 
961 static struct ureg_dst
ntt_ureg_dst_indirect(struct ntt_compile * c,struct ureg_dst dst,nir_src src)962 ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
963                       nir_src src)
964 {
965    if (nir_src_is_const(src)) {
966       dst.Index += nir_src_as_uint(src);
967       return dst;
968    } else {
969       return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src)));
970    }
971 }
972 
973 static struct ureg_src
ntt_ureg_src_dimension_indirect(struct ntt_compile * c,struct ureg_src usrc,nir_src src)974 ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
975                          nir_src src)
976 {
977    if (nir_src_is_const(src)) {
978       return ureg_src_dimension(usrc, nir_src_as_uint(src));
979    } else {
980       return ureg_src_dimension_indirect(usrc,
981                                          ntt_reladdr(c, ntt_get_src(c, src)),
982                                          1);
983    }
984 }
985 
986 static void
ntt_emit_load_uniform(struct ntt_compile * c,nir_intrinsic_instr * instr)987 ntt_emit_load_uniform(struct ntt_compile *c, nir_intrinsic_instr *instr)
988 {
989    struct ureg_src src =
990       ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_CONSTANT,
991                                                  nir_intrinsic_base(instr)),
992                             instr->src[0]);
993    ntt_store(c, &instr->dest, src);
994 }
995 
996 /* Some load operations in NIR will have a fractional offset that we need to
997  * swizzle down before storing to the result register.
998  */
999 static struct ureg_src
ntt_shift_by_frac(struct ureg_src src,unsigned frac,unsigned num_components)1000 ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
1001 {
1002    return ureg_swizzle(src,
1003                        frac,
1004                        frac + MIN2(num_components - 1, 1),
1005                        frac + MIN2(num_components - 1, 2),
1006                        frac + MIN2(num_components - 1, 3));
1007 }
1008 
1009 /* PIPE_CAP_LOAD_CONSTBUF */
1010 static void
ntt_emit_load_ubo(struct ntt_compile * c,nir_intrinsic_instr * instr)1011 ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
1012 {
1013    /* XXX: Emit a TGSI_OPCODE_LOAD instr. */
1014 }
1015 
1016 /* !PIPE_CAP_LOAD_CONSTBUF */
1017 static void
ntt_emit_load_ubo_vec4(struct ntt_compile * c,nir_intrinsic_instr * instr)1018 ntt_emit_load_ubo_vec4(struct ntt_compile *c, nir_intrinsic_instr *instr)
1019 {
1020    int bit_size = nir_dest_bit_size(instr->dest);
1021    assert(bit_size == 32 || instr->num_components <= 2);
1022 
1023    struct ureg_src src;
1024    if (nir_src_is_const(instr->src[1])) {
1025       src = ureg_src_register(TGSI_FILE_CONSTANT,
1026                               nir_src_as_uint(instr->src[1]));
1027    } else {
1028       src = ureg_src_indirect(ureg_src_register(TGSI_FILE_CONSTANT, 0),
1029                               ntt_reladdr(c, ntt_get_src(c, instr->src[1])));
1030    }
1031 
1032    int start_component = nir_intrinsic_component(instr);
1033    if (bit_size == 64)
1034       start_component *= 2;
1035 
1036    src = ntt_shift_by_frac(src, start_component,
1037                            instr->num_components * bit_size / 32);
1038 
1039    if (nir_src_is_const(instr->src[0])) {
1040       src = ureg_src_dimension(src, nir_src_as_uint(instr->src[0]) + 1);
1041    } else {
1042       struct ureg_src block_index = ntt_get_src(c, instr->src[0]);
1043 
1044       src = ureg_src_dimension_indirect(src, ntt_reladdr(c, block_index), 1);
1045    }
1046 
1047    ntt_store(c, &instr->dest, src);
1048 }
1049 
1050 static unsigned
ntt_get_access_qualifier(nir_intrinsic_instr * instr)1051 ntt_get_access_qualifier(nir_intrinsic_instr *instr)
1052 {
1053    enum gl_access_qualifier access = nir_intrinsic_access(instr);
1054    unsigned qualifier = 0;
1055 
1056    if (access & ACCESS_COHERENT)
1057       qualifier |= TGSI_MEMORY_COHERENT;
1058    if (access & ACCESS_VOLATILE)
1059       qualifier |= TGSI_MEMORY_VOLATILE;
1060    if (access & ACCESS_RESTRICT)
1061       qualifier |= TGSI_MEMORY_RESTRICT;
1062 
1063    return qualifier;
1064 }
1065 
1066 static void
ntt_emit_mem(struct ntt_compile * c,nir_intrinsic_instr * instr,nir_variable_mode mode)1067 ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
1068              nir_variable_mode mode)
1069 {
1070    bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
1071                     instr->intrinsic == nir_intrinsic_store_shared);
1072    bool is_load = (instr->intrinsic == nir_intrinsic_load_ssbo ||
1073                     instr->intrinsic == nir_intrinsic_load_shared);
1074    unsigned opcode;
1075    struct ureg_src src[4];
1076    int num_src = 0;
1077    int nir_src;
1078 
1079    struct ureg_src memory;
1080    switch (mode) {
1081    case nir_var_mem_ssbo:
1082       /* XXX: TGSI should have BUFFER declarations for the SSBOs.  Needed for
1083        * r600, nv50, llvmpipe.
1084        */
1085       memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, 0),
1086                                      instr->src[is_store ? 1 : 0]);
1087       nir_src = 1;
1088       break;
1089    case nir_var_mem_shared:
1090       memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
1091       nir_src = 0;
1092       break;
1093    default:
1094       unreachable("unknown memory type");
1095    }
1096 
1097    if (is_store) {
1098       src[num_src++] = ntt_get_src(c, instr->src[nir_src + 1]); /* offset */
1099       src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
1100    } else {
1101       src[num_src++] = memory;
1102       if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
1103          src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* offset */
1104          if (!is_load)
1105             src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* value */
1106       }
1107    }
1108 
1109 
1110    switch (instr->intrinsic) {
1111    case nir_intrinsic_ssbo_atomic_add:
1112    case nir_intrinsic_shared_atomic_add:
1113       opcode = TGSI_OPCODE_ATOMUADD;
1114       break;
1115    case nir_intrinsic_ssbo_atomic_fadd:
1116    case nir_intrinsic_shared_atomic_fadd:
1117       opcode = TGSI_OPCODE_ATOMFADD;
1118       break;
1119    case nir_intrinsic_ssbo_atomic_imin:
1120    case nir_intrinsic_shared_atomic_imin:
1121       opcode = TGSI_OPCODE_ATOMIMIN;
1122       break;
1123    case nir_intrinsic_ssbo_atomic_imax:
1124    case nir_intrinsic_shared_atomic_imax:
1125       opcode = TGSI_OPCODE_ATOMIMAX;
1126       break;
1127    case nir_intrinsic_ssbo_atomic_umin:
1128    case nir_intrinsic_shared_atomic_umin:
1129       opcode = TGSI_OPCODE_ATOMUMIN;
1130       break;
1131    case nir_intrinsic_ssbo_atomic_umax:
1132    case nir_intrinsic_shared_atomic_umax:
1133       opcode = TGSI_OPCODE_ATOMUMAX;
1134       break;
1135    case nir_intrinsic_ssbo_atomic_and:
1136    case nir_intrinsic_shared_atomic_and:
1137       opcode = TGSI_OPCODE_ATOMAND;
1138       break;
1139    case nir_intrinsic_ssbo_atomic_or:
1140    case nir_intrinsic_shared_atomic_or:
1141       opcode = TGSI_OPCODE_ATOMOR;
1142       break;
1143    case nir_intrinsic_ssbo_atomic_xor:
1144    case nir_intrinsic_shared_atomic_xor:
1145       opcode = TGSI_OPCODE_ATOMXOR;
1146       break;
1147    case nir_intrinsic_ssbo_atomic_exchange:
1148    case nir_intrinsic_shared_atomic_exchange:
1149       opcode = TGSI_OPCODE_ATOMXCHG;
1150       break;
1151    case nir_intrinsic_ssbo_atomic_comp_swap:
1152    case nir_intrinsic_shared_atomic_comp_swap:
1153       opcode = TGSI_OPCODE_ATOMCAS;
1154       src[num_src++] = ntt_get_src(c, instr->src[nir_src++]);
1155       break;
1156    case nir_intrinsic_load_ssbo:
1157    case nir_intrinsic_load_shared:
1158       opcode = TGSI_OPCODE_LOAD;
1159       break;
1160    case nir_intrinsic_store_ssbo:
1161    case nir_intrinsic_store_shared:
1162       opcode = TGSI_OPCODE_STORE;
1163       break;
1164    case nir_intrinsic_get_ssbo_size:
1165       opcode = TGSI_OPCODE_RESQ;
1166       break;
1167    default:
1168       unreachable("unknown memory op");
1169    }
1170 
1171    unsigned qualifier = 0;
1172    if (mode == nir_var_mem_ssbo &&
1173        instr->intrinsic != nir_intrinsic_get_ssbo_size) {
1174       qualifier = ntt_get_access_qualifier(instr);
1175    }
1176 
1177    struct ureg_dst dst;
1178    if (is_store) {
1179       dst = ureg_dst(memory);
1180 
1181       unsigned write_mask = nir_intrinsic_write_mask(instr);
1182       if (nir_src_bit_size(instr->src[0]) == 64)
1183          write_mask = ntt_64bit_write_mask(write_mask);
1184       dst = ureg_writemask(dst, write_mask);
1185    } else {
1186       dst = ntt_get_dest(c, &instr->dest);
1187    }
1188 
1189    ureg_memory_insn(c->ureg, opcode,
1190                     &dst, 1,
1191                     src, num_src,
1192                     qualifier,
1193                     TGSI_TEXTURE_BUFFER,
1194                     0 /* format: unused */);
1195 }
1196 
1197 static enum tgsi_texture_type
tgsi_target_from_sampler_dim(enum glsl_sampler_dim dim,bool is_array)1198 tgsi_target_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array)
1199 {
1200    switch (dim) {
1201    case GLSL_SAMPLER_DIM_1D:
1202       return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
1203    case GLSL_SAMPLER_DIM_2D:
1204       return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
1205    case GLSL_SAMPLER_DIM_3D:
1206       return TGSI_TEXTURE_3D;
1207    case GLSL_SAMPLER_DIM_CUBE:
1208       return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
1209    case GLSL_SAMPLER_DIM_RECT:
1210       return TGSI_TEXTURE_RECT;
1211    case GLSL_SAMPLER_DIM_BUF:
1212       return TGSI_TEXTURE_BUFFER;
1213    default:
1214       unreachable("unknown sampler dim");
1215    }
1216 }
1217 
1218 static void
ntt_emit_image_load_store(struct ntt_compile * c,nir_intrinsic_instr * instr)1219 ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
1220 {
1221    unsigned op;
1222    struct ureg_src srcs[3];
1223    int num_src = 0;
1224 
1225    enum tgsi_texture_type target =
1226       tgsi_target_from_sampler_dim(nir_intrinsic_image_dim(instr),
1227                                    nir_intrinsic_image_array(instr));
1228 
1229    struct ureg_src resource =
1230       ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
1231                             instr->src[0]);
1232 
1233    struct ureg_dst dst;
1234    if (instr->intrinsic == nir_intrinsic_image_store) {
1235       dst = ureg_dst(resource);
1236    } else {
1237       srcs[num_src++] = resource;
1238       dst = ntt_get_dest(c, &instr->dest);
1239    }
1240 
1241    if (instr->intrinsic != nir_intrinsic_image_size) {
1242       srcs[num_src++] = ntt_get_src(c, instr->src[1]); /* coord */
1243       /* XXX: src[2] sample index to coord.z (2d) or coord.w (2darray) */
1244       if (instr->intrinsic != nir_intrinsic_image_load) {
1245          srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
1246          if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
1247             srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
1248       }
1249    }
1250 
1251    switch (instr->intrinsic) {
1252    case nir_intrinsic_image_load:
1253       op = TGSI_OPCODE_LOAD;
1254       break;
1255    case nir_intrinsic_image_store:
1256       op = TGSI_OPCODE_STORE;
1257       break;
1258    case nir_intrinsic_image_size:
1259       op = TGSI_OPCODE_RESQ;
1260       break;
1261    case nir_intrinsic_image_atomic_add:
1262       op = TGSI_OPCODE_ATOMUADD;
1263       break;
1264    case nir_intrinsic_image_atomic_fadd:
1265       op = TGSI_OPCODE_ATOMFADD;
1266       break;
1267    case nir_intrinsic_image_atomic_imin:
1268       op = TGSI_OPCODE_ATOMIMIN;
1269       break;
1270    case nir_intrinsic_image_atomic_umin:
1271       op = TGSI_OPCODE_ATOMUMIN;
1272       break;
1273    case nir_intrinsic_image_atomic_imax:
1274       op = TGSI_OPCODE_ATOMIMAX;
1275       break;
1276    case nir_intrinsic_image_atomic_umax:
1277       op = TGSI_OPCODE_ATOMUMAX;
1278       break;
1279    case nir_intrinsic_image_atomic_and:
1280       op = TGSI_OPCODE_ATOMAND;
1281       break;
1282    case nir_intrinsic_image_atomic_or:
1283       op = TGSI_OPCODE_ATOMOR;
1284       break;
1285    case nir_intrinsic_image_atomic_xor:
1286       op = TGSI_OPCODE_ATOMXOR;
1287       break;
1288    case nir_intrinsic_image_atomic_exchange:
1289       op = TGSI_OPCODE_ATOMXCHG;
1290       break;
1291    case nir_intrinsic_image_atomic_comp_swap:
1292       op = TGSI_OPCODE_ATOMCAS;
1293       break;
1294    default:
1295       unreachable("bad op");
1296    }
1297 
1298    ureg_memory_insn(c->ureg, op, &dst, 1, srcs, num_src,
1299                     ntt_get_access_qualifier(instr),
1300                     target,
1301                     nir_intrinsic_format(instr));
1302 }
1303 
1304 static void
ntt_emit_load_input(struct ntt_compile * c,nir_intrinsic_instr * instr)1305 ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
1306 {
1307    uint32_t frac = nir_intrinsic_component(instr);
1308    uint32_t num_components = instr->num_components;
1309    unsigned base = nir_intrinsic_base(instr);
1310    struct ureg_src input;
1311    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
1312    bool is_64 = nir_dest_bit_size(instr->dest) == 64;
1313 
1314    if (c->s->info.stage == MESA_SHADER_VERTEX) {
1315       input = ureg_DECL_vs_input(c->ureg, base);
1316       for (int i = 1; i < semantics.num_slots; i++)
1317          ureg_DECL_vs_input(c->ureg, base + i);
1318    } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
1319       unsigned semantic_name, semantic_index;
1320       ntt_get_gl_varying_semantic(c, semantics.location,
1321                                   &semantic_name, &semantic_index);
1322 
1323       /* XXX: ArrayID is used in r600 gs inputs */
1324       uint32_t array_id = 0;
1325 
1326       input = ureg_DECL_input_layout(c->ureg,
1327                                      semantic_name,
1328                                      semantic_index,
1329                                      base,
1330                                      ntt_tgsi_usage_mask(frac,
1331                                                          instr->num_components,
1332                                                          is_64),
1333                                      array_id,
1334                                      semantics.num_slots);
1335    } else {
1336       input = c->input_index_map[base];
1337    }
1338 
1339    if (is_64)
1340       num_components *= 2;
1341 
1342    input = ntt_shift_by_frac(input, frac, num_components);
1343 
1344    switch (instr->intrinsic) {
1345    case nir_intrinsic_load_input:
1346       input = ntt_ureg_src_indirect(c, input, instr->src[0]);
1347       ntt_store(c, &instr->dest, input);
1348       break;
1349 
1350    case nir_intrinsic_load_per_vertex_input:
1351       input = ntt_ureg_src_indirect(c, input, instr->src[1]);
1352       input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
1353       ntt_store(c, &instr->dest, input);
1354       break;
1355 
1356    case nir_intrinsic_load_interpolated_input: {
1357       input = ntt_ureg_src_indirect(c, input, instr->src[1]);
1358 
1359       nir_intrinsic_instr *bary_instr =
1360          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
1361 
1362       switch (bary_instr->intrinsic) {
1363       case nir_intrinsic_load_barycentric_pixel:
1364          ntt_store(c, &instr->dest, input);
1365          break;
1366 
1367       case nir_intrinsic_load_barycentric_centroid:
1368          /* If the input was declared centroid, then there's no need to
1369           * emit the extra TGSI interp instruction, we can just read the
1370           * input.
1371           */
1372          if (c->centroid_inputs & (1 << nir_intrinsic_base(instr))) {
1373             ntt_store(c, &instr->dest, input);
1374          } else {
1375             ureg_INTERP_CENTROID(c->ureg, ntt_get_dest(c, &instr->dest),
1376                                  input);
1377          }
1378          break;
1379 
1380       case nir_intrinsic_load_barycentric_at_sample:
1381          ureg_INTERP_SAMPLE(c->ureg, ntt_get_dest(c, &instr->dest), input,
1382                             ureg_imm1u(c->ureg,
1383                                        nir_src_as_uint(bary_instr->src[0])));
1384          break;
1385 
1386       case nir_intrinsic_load_barycentric_at_offset:
1387          /* We stored the offset in the fake "bary" dest. */
1388          ureg_INTERP_OFFSET(c->ureg, ntt_get_dest(c, &instr->dest), input,
1389                             ntt_get_src(c, instr->src[0]));
1390          break;
1391 
1392       default:
1393          unreachable("bad barycentric interp intrinsic\n");
1394       }
1395       break;
1396    }
1397 
1398    default:
1399       unreachable("bad load input intrinsic\n");
1400    }
1401 }
1402 
1403 static void
ntt_emit_store_output(struct ntt_compile * c,nir_intrinsic_instr * instr)1404 ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
1405 {
1406    /* TODO: When making an SSA def's storage, we should check if it's only
1407     * used as the source of a store_output and point it at our
1408     * TGSI_FILE_OUTPUT instead of generating the extra MOV here.
1409     */
1410    uint32_t base = nir_intrinsic_base(instr);
1411    struct ureg_src src = ntt_get_src(c, instr->src[0]);
1412    bool is_64 = nir_src_bit_size(instr->src[0]) == 64;
1413    struct ureg_dst out;
1414    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
1415    uint32_t frac = nir_intrinsic_component(instr);
1416 
1417    if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1418       if (semantics.location == FRAG_RESULT_COLOR)
1419          ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
1420 
1421       unsigned semantic_name, semantic_index;
1422       tgsi_get_gl_frag_result_semantic(semantics.location,
1423                                        &semantic_name, &semantic_index);
1424       semantic_index += semantics.dual_source_blend_index;
1425 
1426       out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
1427 
1428       switch (semantics.location) {
1429       case FRAG_RESULT_DEPTH:
1430          frac = 2; /* z write is the to the .z channel in TGSI */
1431          break;
1432       case FRAG_RESULT_STENCIL:
1433          frac = 1;
1434          break;
1435       default:
1436          break;
1437       }
1438    } else {
1439       unsigned semantic_name, semantic_index;
1440 
1441       ntt_get_gl_varying_semantic(c, semantics.location,
1442                                   &semantic_name, &semantic_index);
1443 
1444       uint32_t usage_mask = ntt_tgsi_usage_mask(frac,
1445                                                 instr->num_components,
1446                                                 is_64);
1447       uint32_t gs_streams = semantics.gs_streams;
1448       for (int i = 0; i < 4; i++) {
1449          if (!(usage_mask & (1 << i)))
1450             gs_streams &= ~(0x3 << 2 * i);
1451       }
1452 
1453       /* XXX: array_id is used in svga tess. */
1454       unsigned array_id = 0;
1455 
1456       /* This bit is lost in the i/o semantics, but it's unused in in-tree
1457        * drivers.
1458        */
1459       bool invariant = false;
1460 
1461       out = ureg_DECL_output_layout(c->ureg,
1462                                     semantic_name, semantic_index,
1463                                     gs_streams,
1464                                     base,
1465                                     usage_mask,
1466                                     array_id,
1467                                     semantics.num_slots,
1468                                     invariant);
1469    }
1470 
1471    out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
1472 
1473    unsigned write_mask = nir_intrinsic_write_mask(instr);
1474 
1475    if (is_64) {
1476       write_mask = ntt_64bit_write_mask(write_mask);
1477       if (frac >= 2)
1478          write_mask = write_mask << 2;
1479    } else {
1480       write_mask = write_mask << frac;
1481    }
1482 
1483    uint8_t swizzle[4] = { 0, 0, 0, 0 };
1484    for (int i = frac; i <= 4; i++) {
1485       if (write_mask & (1 << i))
1486          swizzle[i] = i - frac;
1487    }
1488 
1489    src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1490    out = ureg_writemask(out, write_mask);
1491 
1492    ureg_MOV(c->ureg, out, src);
1493    ntt_reladdr_dst_put(c, out);
1494 }
1495 
1496 static void
ntt_emit_load_sysval(struct ntt_compile * c,nir_intrinsic_instr * instr)1497 ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
1498 {
1499    gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
1500    enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
1501    ntt_store(c, &instr->dest, ureg_DECL_system_value(c->ureg, semantic, 0));
1502 }
1503 
1504 static void
ntt_emit_intrinsic(struct ntt_compile * c,nir_intrinsic_instr * instr)1505 ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
1506 {
1507    switch (instr->intrinsic) {
1508    case nir_intrinsic_load_uniform:
1509       ntt_emit_load_uniform(c, instr);
1510       break;
1511 
1512    case nir_intrinsic_load_ubo:
1513       ntt_emit_load_ubo(c, instr);
1514       break;
1515 
1516    case nir_intrinsic_load_ubo_vec4:
1517       ntt_emit_load_ubo_vec4(c, instr);
1518       break;
1519 
1520       /* Vertex */
1521    case nir_intrinsic_load_vertex_id:
1522    case nir_intrinsic_load_vertex_id_zero_base:
1523    case nir_intrinsic_load_base_vertex:
1524    case nir_intrinsic_load_base_instance:
1525    case nir_intrinsic_load_instance_id:
1526    case nir_intrinsic_load_draw_id:
1527    case nir_intrinsic_load_invocation_id:
1528    case nir_intrinsic_load_frag_coord:
1529    case nir_intrinsic_load_point_coord:
1530    case nir_intrinsic_load_front_face:
1531    case nir_intrinsic_load_sample_id:
1532    case nir_intrinsic_load_sample_mask_in:
1533    case nir_intrinsic_load_helper_invocation:
1534    case nir_intrinsic_load_tess_coord:
1535    case nir_intrinsic_load_patch_vertices_in:
1536    case nir_intrinsic_load_primitive_id:
1537    case nir_intrinsic_load_tess_level_outer:
1538    case nir_intrinsic_load_tess_level_inner:
1539    case nir_intrinsic_load_local_invocation_id:
1540    case nir_intrinsic_load_work_group_id:
1541    case nir_intrinsic_load_num_work_groups:
1542    case nir_intrinsic_load_local_group_size:
1543    case nir_intrinsic_load_subgroup_size:
1544    case nir_intrinsic_load_subgroup_invocation:
1545    case nir_intrinsic_load_subgroup_eq_mask:
1546    case nir_intrinsic_load_subgroup_ge_mask:
1547    case nir_intrinsic_load_subgroup_gt_mask:
1548    case nir_intrinsic_load_subgroup_lt_mask:
1549       ntt_emit_load_sysval(c, instr);
1550       break;
1551 
1552    case nir_intrinsic_load_input:
1553    case nir_intrinsic_load_per_vertex_input:
1554    case nir_intrinsic_load_interpolated_input:
1555       ntt_emit_load_input(c, instr);
1556       break;
1557 
1558    case nir_intrinsic_store_output:
1559       ntt_emit_store_output(c, instr);
1560       break;
1561 
1562    case nir_intrinsic_discard:
1563       ureg_KILL(c->ureg);
1564       break;
1565 
1566    case nir_intrinsic_discard_if: {
1567       struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);
1568 
1569       if (c->native_integers) {
1570          struct ureg_dst temp = ureg_writemask(ureg_DECL_temporary(c->ureg), 1);
1571          ureg_AND(c->ureg, temp, cond, ureg_imm1f(c->ureg, 1.0));
1572          ureg_KILL_IF(c->ureg, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
1573          ureg_release_temporary(c->ureg, temp);
1574       } else {
1575          /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
1576          ureg_KILL_IF(c->ureg, ureg_negate(cond));
1577       }
1578       break;
1579    }
1580 
1581    case nir_intrinsic_load_ssbo:
1582    case nir_intrinsic_store_ssbo:
1583    case nir_intrinsic_ssbo_atomic_add:
1584    case nir_intrinsic_ssbo_atomic_fadd:
1585    case nir_intrinsic_ssbo_atomic_imin:
1586    case nir_intrinsic_ssbo_atomic_imax:
1587    case nir_intrinsic_ssbo_atomic_umin:
1588    case nir_intrinsic_ssbo_atomic_umax:
1589    case nir_intrinsic_ssbo_atomic_and:
1590    case nir_intrinsic_ssbo_atomic_or:
1591    case nir_intrinsic_ssbo_atomic_xor:
1592    case nir_intrinsic_ssbo_atomic_exchange:
1593    case nir_intrinsic_ssbo_atomic_comp_swap:
1594    case nir_intrinsic_get_ssbo_size:
1595       ntt_emit_mem(c, instr, nir_var_mem_ssbo);
1596       break;
1597 
1598    case nir_intrinsic_load_shared:
1599    case nir_intrinsic_store_shared:
1600    case nir_intrinsic_shared_atomic_add:
1601    case nir_intrinsic_shared_atomic_fadd:
1602    case nir_intrinsic_shared_atomic_imin:
1603    case nir_intrinsic_shared_atomic_imax:
1604    case nir_intrinsic_shared_atomic_umin:
1605    case nir_intrinsic_shared_atomic_umax:
1606    case nir_intrinsic_shared_atomic_and:
1607    case nir_intrinsic_shared_atomic_or:
1608    case nir_intrinsic_shared_atomic_xor:
1609    case nir_intrinsic_shared_atomic_exchange:
1610    case nir_intrinsic_shared_atomic_comp_swap:
1611       ntt_emit_mem(c, instr, nir_var_mem_shared);
1612       break;
1613 
1614    case nir_intrinsic_image_load:
1615    case nir_intrinsic_image_store:
1616    case nir_intrinsic_image_size:
1617    case nir_intrinsic_image_atomic_add:
1618    case nir_intrinsic_image_atomic_fadd:
1619    case nir_intrinsic_image_atomic_imin:
1620    case nir_intrinsic_image_atomic_umin:
1621    case nir_intrinsic_image_atomic_imax:
1622    case nir_intrinsic_image_atomic_umax:
1623    case nir_intrinsic_image_atomic_and:
1624    case nir_intrinsic_image_atomic_or:
1625    case nir_intrinsic_image_atomic_xor:
1626    case nir_intrinsic_image_atomic_exchange:
1627    case nir_intrinsic_image_atomic_comp_swap:
1628       ntt_emit_image_load_store(c, instr);
1629       break;
1630 
1631    case nir_intrinsic_control_barrier:
1632       ureg_BARRIER(c->ureg);
1633       break;
1634 
1635    case nir_intrinsic_memory_barrier:
1636       ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
1637                                       TGSI_MEMBAR_SHADER_BUFFER |
1638                                       TGSI_MEMBAR_ATOMIC_BUFFER |
1639                                       TGSI_MEMBAR_SHADER_IMAGE |
1640                                       TGSI_MEMBAR_SHARED));
1641       break;
1642 
1643    case nir_intrinsic_memory_barrier_atomic_counter:
1644       ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_ATOMIC_BUFFER));
1645       break;
1646 
1647    case nir_intrinsic_memory_barrier_buffer:
1648       ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_BUFFER));
1649       break;
1650 
1651    case nir_intrinsic_memory_barrier_image:
1652       ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_IMAGE));
1653       break;
1654 
1655    case nir_intrinsic_memory_barrier_shared:
1656       ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHARED));
1657       break;
1658 
1659    case nir_intrinsic_group_memory_barrier:
1660       ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
1661                                       TGSI_MEMBAR_SHADER_BUFFER |
1662                                       TGSI_MEMBAR_ATOMIC_BUFFER |
1663                                       TGSI_MEMBAR_SHADER_IMAGE |
1664                                       TGSI_MEMBAR_SHARED |
1665                                       TGSI_MEMBAR_THREAD_GROUP));
1666       break;
1667 
1668    case nir_intrinsic_end_primitive:
1669       ureg_ENDPRIM(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
1670       break;
1671 
1672    case nir_intrinsic_emit_vertex:
1673       ureg_EMIT(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
1674       break;
1675 
1676       /* In TGSI we don't actually generate the barycentric coords, and emit
1677        * interp intrinsics later.  However, we do need to store the _at_offset
1678        * argument so that we can use it at that point.
1679        */
1680    case nir_intrinsic_load_barycentric_pixel:
1681    case nir_intrinsic_load_barycentric_centroid:
1682    case nir_intrinsic_load_barycentric_at_sample:
1683       break;
1684 
1685    case nir_intrinsic_load_barycentric_at_offset:
1686       ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0]));
1687       break;
1688 
1689    default:
1690       fprintf(stderr, "Unknown intrinsic: ");
1691       nir_print_instr(&instr->instr, stderr);
1692       fprintf(stderr, "\n");
1693       break;
1694    }
1695 }
1696 
1697 struct ntt_tex_operand_state {
1698    struct ureg_src srcs[4];
1699    unsigned i;
1700    unsigned chan;
1701    bool is_temp[4];
1702 };
1703 
1704 static void
ntt_push_tex_arg(struct ntt_compile * c,nir_tex_instr * instr,nir_tex_src_type tex_src_type,struct ntt_tex_operand_state * s)1705 ntt_push_tex_arg(struct ntt_compile *c,
1706                  nir_tex_instr *instr,
1707                  nir_tex_src_type tex_src_type,
1708                  struct ntt_tex_operand_state *s)
1709 {
1710    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
1711    if (tex_src < 0)
1712       return;
1713 
1714    struct ureg_src src = ntt_get_src(c, instr->src[tex_src].src);
1715    int num_components = nir_tex_instr_src_size(instr, tex_src);
1716 
1717    /* Find which src in the tex args we'll fit in. */
1718    if (s->chan + num_components > 4) {
1719       s->chan = 0;
1720       s->i++;
1721    }
1722 
1723    /* Would need to fix up swizzling up to the writemask channel here. */
1724    assert(num_components == 1 || s->chan == 0);
1725    if (num_components == 1)
1726       src = ureg_scalar(src, 0);
1727 
1728    if (ureg_src_is_undef(s->srcs[s->i])) {
1729       /* First emit of a tex operand's components, no need for a mov. */
1730       s->srcs[s->i] = src;
1731    } else {
1732       /* Otherwise, we need to have a temporary for all the components that go
1733        * in this operand.
1734        */
1735       if (!s->is_temp[s->i]) {
1736          struct ureg_src prev_src = s->srcs[s->i];
1737          s->srcs[s->i] = ureg_src(ureg_DECL_temporary(c->ureg));
1738          s->is_temp[s->i] = true;
1739 
1740          ureg_MOV(c->ureg,
1741                   ureg_writemask(ureg_dst(s->srcs[s->i]),
1742                                  BITFIELD_MASK(s->chan)), prev_src);
1743       }
1744 
1745       ureg_MOV(c->ureg,
1746                ureg_writemask(ureg_dst(s->srcs[s->i]),
1747                               BITFIELD_RANGE(s->chan, num_components)),
1748                src);
1749    }
1750 
1751    s->chan += num_components;
1752 }
1753 
1754 static void
ntt_emit_texture(struct ntt_compile * c,nir_tex_instr * instr)1755 ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
1756 {
1757    struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
1758    unsigned target;
1759    unsigned tex_opcode;
1760 
1761    struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
1762    int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
1763    if (sampler_src >= 0) {
1764       struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
1765       sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr));
1766    }
1767 
1768    switch (instr->op) {
1769    case nir_texop_tex:
1770       tex_opcode = TGSI_OPCODE_TEX;
1771       break;
1772    case nir_texop_txf:
1773    case nir_texop_txf_ms:
1774       /* XXX: Support txf_lz */
1775       tex_opcode = TGSI_OPCODE_TXF;
1776       break;
1777    case nir_texop_txl:
1778       tex_opcode = TGSI_OPCODE_TXL;
1779       break;
1780    case nir_texop_txb:
1781       tex_opcode = TGSI_OPCODE_TXB;
1782       break;
1783    case nir_texop_txd:
1784       tex_opcode = TGSI_OPCODE_TXD;
1785       break;
1786    case nir_texop_txs:
1787       tex_opcode = TGSI_OPCODE_TXQ;
1788       break;
1789    case nir_texop_tg4:
1790       tex_opcode = TGSI_OPCODE_TG4;
1791       break;
1792    case nir_texop_query_levels:
1793       tex_opcode = TGSI_OPCODE_TXQ;
1794       break;
1795    case nir_texop_lod:
1796       tex_opcode = TGSI_OPCODE_LODQ;
1797       break;
1798    case nir_texop_texture_samples:
1799       tex_opcode = TGSI_OPCODE_TXQS;
1800       break;
1801    default:
1802       unreachable("unsupported tex op");
1803    }
1804 
1805    struct ntt_tex_operand_state s = { .i = 0 };
1806    ntt_push_tex_arg(c, instr, nir_tex_src_coord, &s);
1807    /* We always have at least two slots for the coordinate, even on 1D. */
1808    s.chan = MAX2(s.chan, 2);
1809 
1810    ntt_push_tex_arg(c, instr, nir_tex_src_comparator, &s);
1811    s.chan = MAX2(s.chan, 3);
1812 
1813    ntt_push_tex_arg(c, instr, nir_tex_src_bias, &s);
1814    ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
1815 
1816    /* End of packed src setup, everything that follows gets its own operand. */
1817    if (s.chan)
1818       s.i++;
1819 
1820    switch (instr->sampler_dim) {
1821    case GLSL_SAMPLER_DIM_1D:
1822       if (instr->is_array) {
1823          if (instr->is_shadow) {
1824             target = TGSI_TEXTURE_SHADOW1D_ARRAY;
1825          } else {
1826             target = TGSI_TEXTURE_1D_ARRAY;
1827          }
1828       } else {
1829          if (instr->is_shadow) {
1830             target = TGSI_TEXTURE_SHADOW1D;
1831          } else {
1832             target = TGSI_TEXTURE_1D;
1833          }
1834       }
1835       break;
1836    case GLSL_SAMPLER_DIM_2D:
1837    case GLSL_SAMPLER_DIM_EXTERNAL:
1838       if (instr->is_array) {
1839          if (instr->is_shadow) {
1840             target = TGSI_TEXTURE_SHADOW2D_ARRAY;
1841          } else {
1842             target = TGSI_TEXTURE_2D_ARRAY;
1843          }
1844       } else {
1845          if (instr->is_shadow) {
1846             target = TGSI_TEXTURE_SHADOW2D;
1847          } else {
1848             target = TGSI_TEXTURE_2D;
1849          }
1850       }
1851       break;
1852    case GLSL_SAMPLER_DIM_MS:
1853       if (instr->is_array) {
1854          target = TGSI_TEXTURE_2D_ARRAY_MSAA;
1855       } else {
1856          target = TGSI_TEXTURE_2D_ARRAY;
1857       }
1858       break;
1859    case GLSL_SAMPLER_DIM_3D:
1860       assert(!instr->is_shadow);
1861       target = TGSI_TEXTURE_3D;
1862       break;
1863    case GLSL_SAMPLER_DIM_RECT:
1864       if (instr->is_shadow) {
1865          target = TGSI_TEXTURE_SHADOWRECT;
1866       } else {
1867          target = TGSI_TEXTURE_RECT;
1868       }
1869       break;
1870    case GLSL_SAMPLER_DIM_CUBE:
1871       if (instr->is_array) {
1872          if (instr->is_shadow) {
1873             target = TGSI_TEXTURE_SHADOWCUBE_ARRAY;
1874          } else {
1875             target = TGSI_TEXTURE_CUBE_ARRAY;
1876          }
1877       } else {
1878          if (instr->is_shadow) {
1879             target = TGSI_TEXTURE_SHADOWCUBE;
1880          } else {
1881             target = TGSI_TEXTURE_CUBE;
1882          }
1883       }
1884       break;
1885    case GLSL_SAMPLER_DIM_BUF:
1886       target = TGSI_TEXTURE_BUFFER;
1887       break;
1888    default:
1889       fprintf(stderr, "Unknown sampler dimensions: %d\n", instr->sampler_dim);
1890       abort();
1891    }
1892 
1893    if (s.i > 1) {
1894       if (tex_opcode == TGSI_OPCODE_TEX)
1895          tex_opcode = TGSI_OPCODE_TEX2;
1896       if (tex_opcode == TGSI_OPCODE_TXB)
1897          tex_opcode = TGSI_OPCODE_TXB2;
1898       if (tex_opcode == TGSI_OPCODE_TXL)
1899          tex_opcode = TGSI_OPCODE_TXL2;
1900    }
1901 
1902    if (instr->op == nir_texop_txd) {
1903       /* Derivs appear in their own src args */
1904       int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
1905       int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
1906       s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
1907       s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
1908    }
1909 
1910    if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1911       if (c->screen->get_param(c->screen,
1912                                PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
1913          sampler = ureg_scalar(sampler, instr->component);
1914          s.srcs[s.i++] = ureg_src_undef();
1915       } else {
1916          s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
1917       }
1918    }
1919 
1920    s.srcs[s.i++] = sampler;
1921 
1922    enum tgsi_return_type tex_type;
1923    switch (instr->dest_type) {
1924    case nir_type_float:
1925       tex_type = TGSI_RETURN_TYPE_FLOAT;
1926       break;
1927    case nir_type_int:
1928       tex_type = TGSI_RETURN_TYPE_SINT;
1929       break;
1930    case nir_type_uint:
1931       tex_type = TGSI_RETURN_TYPE_UINT;
1932       break;
1933    default:
1934       unreachable("unknown texture type");
1935    }
1936 
1937    struct tgsi_texture_offset tex_offsets[4];
1938    unsigned num_tex_offsets = 0;
1939    int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
1940    if (tex_offset_src >= 0) {
1941       struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);
1942 
1943       tex_offsets[0].File = offset.File;
1944       tex_offsets[0].Index = offset.Index;
1945       tex_offsets[0].SwizzleX = offset.SwizzleX;
1946       tex_offsets[0].SwizzleY = offset.SwizzleY;
1947       tex_offsets[0].SwizzleZ = offset.SwizzleZ;
1948       tex_offsets[0].Padding = 0;
1949 
1950       num_tex_offsets = 1;
1951    }
1952 
1953    struct ureg_dst tex_dst;
1954    if (instr->op == nir_texop_query_levels)
1955       tex_dst = ureg_writemask(ureg_DECL_temporary(c->ureg), TGSI_WRITEMASK_W);
1956    else
1957       tex_dst = dst;
1958 
1959    ureg_tex_insn(c->ureg, tex_opcode,
1960                  &tex_dst, 1,
1961                  target,
1962                  tex_type,
1963                  tex_offsets, num_tex_offsets,
1964                  s.srcs, s.i);
1965 
1966    if (instr->op == nir_texop_query_levels) {
1967       ureg_MOV(c->ureg, dst, ureg_scalar(ureg_src(tex_dst), 3));
1968       ureg_release_temporary(c->ureg, tex_dst);
1969    }
1970 
1971    for (int i = 0; i < s.i; i++) {
1972       if (s.is_temp[i])
1973          ureg_release_temporary(c->ureg, ureg_dst(s.srcs[i]));
1974    }
1975 }
1976 
1977 static void
ntt_emit_jump(struct ntt_compile * c,nir_jump_instr * jump)1978 ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
1979 {
1980    switch (jump->type) {
1981    case nir_jump_break:
1982       ureg_BRK(c->ureg);
1983       break;
1984 
1985    case nir_jump_continue:
1986       ureg_CONT(c->ureg);
1987       break;
1988 
1989    default:
1990       fprintf(stderr, "Unknown jump instruction: ");
1991       nir_print_instr(&jump->instr, stderr);
1992       fprintf(stderr, "\n");
1993       abort();
1994    }
1995 }
1996 
1997 static void
ntt_emit_ssa_undef(struct ntt_compile * c,nir_ssa_undef_instr * instr)1998 ntt_emit_ssa_undef(struct ntt_compile *c, nir_ssa_undef_instr *instr)
1999 {
2000    /* Nothing to do but make sure that we have some storage to deref. */
2001    (void)ntt_get_ssa_def_decl(c, &instr->def);
2002 }
2003 
2004 static void
ntt_emit_instr(struct ntt_compile * c,nir_instr * instr)2005 ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
2006 {
2007    /* There is no addr reg in use before we start emitting an instr. */
2008    c->next_addr_reg = 0;
2009 
2010    switch (instr->type) {
2011    case nir_instr_type_deref:
2012       /* ignored, will be walked by nir_intrinsic_image_*_deref. */
2013       break;
2014 
2015    case nir_instr_type_alu:
2016       ntt_emit_alu(c, nir_instr_as_alu(instr));
2017       break;
2018 
2019    case nir_instr_type_intrinsic:
2020       ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2021       break;
2022 
2023    case nir_instr_type_load_const:
2024       /* Nothing to do here, as load consts are done directly from
2025        * ntt_get_src() (since many constant NIR srcs will often get folded
2026        * directly into a register file index instead of as a TGSI src).
2027        */
2028       break;
2029 
2030    case nir_instr_type_tex:
2031       ntt_emit_texture(c, nir_instr_as_tex(instr));
2032       break;
2033 
2034    case nir_instr_type_jump:
2035       ntt_emit_jump(c, nir_instr_as_jump(instr));
2036       break;
2037 
2038    case nir_instr_type_ssa_undef:
2039       ntt_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
2040       break;
2041 
2042    default:
2043       fprintf(stderr, "Unknown NIR instr type: ");
2044       nir_print_instr(instr, stderr);
2045       fprintf(stderr, "\n");
2046       abort();
2047    }
2048 }
2049 
2050 static void
ntt_emit_if(struct ntt_compile * c,nir_if * if_stmt)2051 ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
2052 {
2053    unsigned label;
2054    ureg_UIF(c->ureg, c->if_cond, &label);
2055    ntt_emit_cf_list(c, &if_stmt->then_list);
2056 
2057    if (!exec_list_is_empty(&if_stmt->else_list)) {
2058       ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
2059       ureg_ELSE(c->ureg, &label);
2060       ntt_emit_cf_list(c, &if_stmt->else_list);
2061    }
2062 
2063    ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
2064    ureg_ENDIF(c->ureg);
2065 }
2066 
2067 static void
ntt_emit_loop(struct ntt_compile * c,nir_loop * loop)2068 ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
2069 {
2070    unsigned last_loop_label = c->loop_label;
2071 
2072    unsigned begin_label;
2073    ureg_BGNLOOP(c->ureg, &begin_label);
2074    ntt_emit_cf_list(c, &loop->body);
2075 
2076    /* XXX: Need to set cont/break labels for svga, nv30, nv50.
2077     *
2078     * ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
2079     */
2080    unsigned end_label;
2081    ureg_ENDLOOP(c->ureg, &end_label);
2082 
2083    c->loop_label = last_loop_label;
2084 }
2085 
2086 static void
ntt_free_ssa_temp_by_index(struct ntt_compile * c,int index)2087 ntt_free_ssa_temp_by_index(struct ntt_compile *c, int index)
2088 {
2089    /* We do store CONST/IMM/INPUT/etc. in ssa_temp[] */
2090    if (c->ssa_temp[index].File != TGSI_FILE_TEMPORARY)
2091       return;
2092 
2093    ureg_release_temporary(c->ureg, c->ssa_temp[index]);
2094    memset(&c->ssa_temp[index], 0, sizeof(c->ssa_temp[index]));
2095 }
2096 
2097 /* Releases any temporaries for SSA defs with a live interval ending at this
2098  * instruction.
2099  */
2100 static bool
ntt_src_live_interval_end_cb(nir_src * src,void * state)2101 ntt_src_live_interval_end_cb(nir_src *src, void *state)
2102 {
2103    struct ntt_compile *c = state;
2104 
2105    if (src->is_ssa) {
2106       nir_ssa_def *def = src->ssa;
2107 
2108       if (c->liveness->defs[def->index].end == src->parent_instr->index)
2109          ntt_free_ssa_temp_by_index(c, def->index);
2110    }
2111 
2112    return true;
2113 }
2114 
2115 static void
ntt_emit_block(struct ntt_compile * c,nir_block * block)2116 ntt_emit_block(struct ntt_compile *c, nir_block *block)
2117 {
2118    nir_foreach_instr(instr, block) {
2119       ntt_emit_instr(c, instr);
2120 
2121       nir_foreach_src(instr, ntt_src_live_interval_end_cb, c);
2122    }
2123 
2124    /* Set up the if condition for ntt_emit_if(), which we have to do before
2125     * freeing up the temps (the "if" is treated as inside the block for liveness
2126     * purposes, despite not being an instruction)
2127     */
2128    nir_if *nif = nir_block_get_following_if(block);
2129    if (nif)
2130       c->if_cond = ntt_get_src(c, nif->condition);
2131 
2132    /* Free up any SSA temps that are unused at the end of the block. */
2133    unsigned index;
2134    BITSET_FOREACH_SET(index, block->live_out, BITSET_WORDS(c->impl->ssa_alloc)) {
2135       unsigned def_end_ip = c->liveness->defs[index].end;
2136       if (def_end_ip == block->end_ip)
2137          ntt_free_ssa_temp_by_index(c, index);
2138    }
2139 }
2140 
2141 static void
ntt_emit_cf_list(struct ntt_compile * c,struct exec_list * list)2142 ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
2143 {
2144    /* There is no addr reg in use before we start emitting any part of a CF
2145     * node (such as an if condition)
2146     */
2147    c->next_addr_reg = 0;
2148 
2149    foreach_list_typed(nir_cf_node, node, node, list) {
2150       switch (node->type) {
2151       case nir_cf_node_block:
2152          ntt_emit_block(c, nir_cf_node_as_block(node));
2153          break;
2154 
2155       case nir_cf_node_if:
2156          ntt_emit_if(c, nir_cf_node_as_if(node));
2157          break;
2158 
2159       case nir_cf_node_loop:
2160          ntt_emit_loop(c, nir_cf_node_as_loop(node));
2161          break;
2162 
2163       default:
2164          unreachable("unknown CF type");
2165       }
2166    }
2167 }
2168 
2169 static void
ntt_emit_impl(struct ntt_compile * c,nir_function_impl * impl)2170 ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
2171 {
2172    /* reindex values so the numbers are reasonably small despite
2173     * optimization having deleted most of them.
2174     */
2175    nir_index_ssa_defs(impl);
2176    nir_index_local_regs(impl);
2177 
2178    nir_index_instrs(impl);
2179 
2180    c->impl = impl;
2181    c->liveness = nir_live_ssa_defs_per_instr(impl);
2182 
2183    c->ssa_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
2184    c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);
2185 
2186    ntt_setup_registers(c, &impl->registers);
2187    ntt_emit_cf_list(c, &impl->body);
2188 
2189    ralloc_free(c->liveness);
2190    c->liveness = NULL;
2191 }
2192 
/* Returns the number of attribute slots occupied by `type`.  The `bindless`
 * argument is not consulted.
 */
static int
type_size(const struct glsl_type *type, bool bindless)
{
   const bool is_gl_vertex_input = false;

   return glsl_count_attribute_slots(type, is_gl_vertex_input);
}
2198 
2199 /* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
2200  * can handle for 64-bit values in TGSI.
2201  */
2202 static bool
ntt_should_vectorize_instr(const nir_instr * in_a,const nir_instr * in_b,void * data)2203 ntt_should_vectorize_instr(const nir_instr *in_a, const nir_instr *in_b,
2204                            void *data)
2205 {
2206    if (in_a->type != nir_instr_type_alu)
2207       return false;
2208 
2209    nir_alu_instr *a = nir_instr_as_alu(in_a);
2210    nir_alu_instr *b = nir_instr_as_alu(in_b);
2211 
2212    unsigned a_num_components = a->dest.dest.ssa.num_components;
2213    unsigned b_num_components = b->dest.dest.ssa.num_components;
2214 
2215    int src_bit_size = nir_src_bit_size(a->src[0].src);
2216    int dst_bit_size = nir_dest_bit_size(a->dest.dest);
2217 
2218    if (src_bit_size == 64 || dst_bit_size == 64) {
2219       if (a_num_components + b_num_components > 2)
2220          return false;
2221    }
2222 
2223    return true;
2224 }
2225 
2226 static bool
ntt_should_vectorize_io(unsigned align,unsigned bit_size,unsigned num_components,unsigned high_offset,nir_intrinsic_instr * low,nir_intrinsic_instr * high)2227 ntt_should_vectorize_io(unsigned align, unsigned bit_size,
2228                         unsigned num_components, unsigned high_offset,
2229                         nir_intrinsic_instr *low, nir_intrinsic_instr *high)
2230 {
2231    if (bit_size != 32)
2232       return false;
2233 
2234    /* Our offset alignment should aways be at least 4 bytes */
2235    if (align < 4)
2236       return false;
2237 
2238    /* No wrapping off the end of a TGSI reg.  We could do a bit better by
2239     * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
2240     * need this restriction.
2241     */
2242    unsigned worst_start_component = align == 4 ? 3 : align / 4;
2243    if (worst_start_component + num_components > 4)
2244       return false;
2245 
2246    return true;
2247 }
2248 
2249 static nir_variable_mode
ntt_no_indirects_mask(nir_shader * s,struct pipe_screen * screen)2250 ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
2251 {
2252    unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
2253    unsigned indirect_mask = 0;
2254 
2255    if (!screen->get_shader_param(screen, pipe_stage,
2256                                  PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
2257       indirect_mask |= nir_var_shader_in;
2258    }
2259 
2260    if (!screen->get_shader_param(screen, pipe_stage,
2261                                  PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
2262       indirect_mask |= nir_var_shader_out;
2263    }
2264 
2265    if (!screen->get_shader_param(screen, pipe_stage,
2266                                  PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
2267       indirect_mask |= nir_var_function_temp;
2268    }
2269 
2270    return indirect_mask;
2271 }
2272 
2273 static void
ntt_optimize_nir(struct nir_shader * s,struct pipe_screen * screen)2274 ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
2275 {
2276    bool progress;
2277    nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
2278    unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
2279    unsigned control_flow_depth =
2280       screen->get_shader_param(screen, pipe_stage,
2281                                PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
2282    do {
2283       progress = false;
2284 
2285       NIR_PASS_V(s, nir_lower_vars_to_ssa);
2286 
2287       NIR_PASS(progress, s, nir_copy_prop);
2288       NIR_PASS(progress, s, nir_opt_algebraic);
2289       NIR_PASS(progress, s, nir_opt_remove_phis);
2290       NIR_PASS(progress, s, nir_opt_conditional_discard);
2291       NIR_PASS(progress, s, nir_opt_dce);
2292       NIR_PASS(progress, s, nir_opt_dead_cf);
2293       NIR_PASS(progress, s, nir_opt_cse);
2294       NIR_PASS(progress, s, nir_opt_find_array_copies);
2295       NIR_PASS(progress, s, nir_opt_if, true);
2296       NIR_PASS(progress, s, nir_opt_peephole_select,
2297                control_flow_depth == 0 ? ~0 : 8, true, true);
2298       NIR_PASS(progress, s, nir_opt_algebraic);
2299       NIR_PASS(progress, s, nir_opt_constant_folding);
2300       NIR_PASS(progress, s, nir_opt_load_store_vectorize, nir_var_mem_ubo,
2301                ntt_should_vectorize_io, 0);
2302       NIR_PASS(progress, s, nir_opt_shrink_vectors);
2303       NIR_PASS(progress, s, nir_opt_trivial_continues);
2304       NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
2305       NIR_PASS(progress, s, nir_opt_undef);
2306       NIR_PASS(progress, s, nir_opt_loop_unroll, no_indirects_mask);
2307 
2308    } while (progress);
2309 }
2310 
2311 /* Scalarizes all 64-bit ALU ops.  Note that we only actually need to
2312  * scalarize vec3/vec4s, should probably fix that.
2313  */
2314 static bool
scalarize_64bit(const nir_instr * instr,const void * data)2315 scalarize_64bit(const nir_instr *instr, const void *data)
2316 {
2317    const nir_alu_instr *alu = nir_instr_as_alu(instr);
2318 
2319    return (nir_dest_bit_size(alu->dest.dest) == 64 ||
2320            nir_src_bit_size(alu->src[0].src) == 64);
2321 }
2322 
2323 static bool
nir_to_tgsi_lower_64bit_intrinsic(nir_builder * b,nir_intrinsic_instr * instr)2324 nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
2325 {
2326    b->cursor = nir_after_instr(&instr->instr);
2327 
2328    switch (instr->intrinsic) {
2329    case nir_intrinsic_load_uniform:
2330    case nir_intrinsic_load_ubo:
2331    case nir_intrinsic_load_ubo_vec4:
2332    case nir_intrinsic_load_ssbo:
2333    case nir_intrinsic_load_input:
2334    case nir_intrinsic_load_interpolated_input:
2335    case nir_intrinsic_load_per_vertex_input:
2336    case nir_intrinsic_store_output:
2337    case nir_intrinsic_store_ssbo:
2338       break;
2339    default:
2340       return false;
2341    }
2342 
2343    if (instr->num_components <= 2)
2344       return false;
2345 
2346    bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
2347    if (has_dest) {
2348       if (nir_dest_bit_size(instr->dest) != 64)
2349          return false;
2350    } else  {
2351       if (nir_src_bit_size(instr->src[0]) != 64)
2352           return false;
2353    }
2354 
2355    nir_intrinsic_instr *first =
2356       nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
2357    nir_intrinsic_instr *second =
2358       nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
2359 
2360    switch (instr->intrinsic) {
2361    case nir_intrinsic_load_uniform:
2362       nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
2363       break;
2364 
2365    case nir_intrinsic_load_ubo:
2366    case nir_intrinsic_load_ubo_vec4:
2367    case nir_intrinsic_load_ssbo:
2368    case nir_intrinsic_store_ssbo:
2369       break;
2370 
2371    default: {
2372       nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
2373       semantics.location++;
2374       semantics.num_slots--;
2375       nir_intrinsic_set_io_semantics(second, semantics);
2376 
2377       nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
2378       break;
2379    }
2380    }
2381 
2382    first->num_components = 2;
2383    second->num_components -= 2;
2384    if (has_dest) {
2385       first->dest.ssa.num_components = 2;
2386       second->dest.ssa.num_components -= 2;
2387    }
2388 
2389    nir_builder_instr_insert(b, &first->instr);
2390    nir_builder_instr_insert(b, &second->instr);
2391 
2392    if (has_dest) {
2393       /* Merge the two loads' results back into a vector. */
2394       nir_ssa_def *channels[4] = {
2395          nir_channel(b, &first->dest.ssa, 0),
2396          nir_channel(b, &first->dest.ssa, 1),
2397          nir_channel(b, &second->dest.ssa, 0),
2398          second->num_components > 1 ? nir_channel(b, &second->dest.ssa, 1) : NULL,
2399       };
2400       nir_ssa_def *new = nir_vec(b, channels, instr->num_components);
2401       nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(new));
2402    } else {
2403       /* Split the src value across the two stores. */
2404       b->cursor = nir_before_instr(&instr->instr);
2405 
2406       nir_ssa_def *src0 = instr->src[0].ssa;
2407       nir_ssa_def *channels[4] = { 0 };
2408       for (int i = 0; i < instr->num_components; i++)
2409          channels[i] = nir_channel(b, src0, i);
2410 
2411       nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
2412       nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
2413 
2414       nir_instr_rewrite_src(&first->instr, &first->src[0],
2415                             nir_src_for_ssa(nir_vec(b, channels, 2)));
2416       nir_instr_rewrite_src(&second->instr, &second->src[0],
2417                             nir_src_for_ssa(nir_vec(b, &channels[2],
2418                                                     second->num_components)));
2419    }
2420 
2421    int offset_src = -1;
2422    uint32_t offset_amount = 16;
2423 
2424    switch (instr->intrinsic) {
2425    case nir_intrinsic_load_ssbo:
2426    case nir_intrinsic_load_ubo:
2427       offset_src = 1;
2428       break;
2429    case nir_intrinsic_load_ubo_vec4:
2430       offset_src = 1;
2431       offset_amount = 1;
2432       break;
2433    case nir_intrinsic_store_ssbo:
2434       offset_src = 2;
2435       break;
2436    default:
2437       break;
2438    }
2439    if (offset_src != -1) {
2440       b->cursor = nir_before_instr(&second->instr);
2441       nir_ssa_def *second_offset =
2442          nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
2443       nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
2444                             nir_src_for_ssa(second_offset));
2445    }
2446 
2447    /* DCE stores we generated with no writemask (nothing else does this
2448     * currently).
2449     */
2450    if (!has_dest) {
2451       if (nir_intrinsic_write_mask(first) == 0)
2452          nir_instr_remove(&first->instr);
2453       if (nir_intrinsic_write_mask(second) == 0)
2454          nir_instr_remove(&second->instr);
2455    }
2456 
2457    nir_instr_remove(&instr->instr);
2458 
2459    return true;
2460 }
2461 
2462 static bool
nir_to_tgsi_lower_64bit_load_const(nir_builder * b,nir_load_const_instr * instr)2463 nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
2464 {
2465    int num_components = instr->def.num_components;
2466 
2467    if (instr->def.bit_size != 64 || num_components <= 2)
2468       return false;
2469 
2470    b->cursor = nir_before_instr(&instr->instr);
2471 
2472    nir_load_const_instr *first =
2473       nir_load_const_instr_create(b->shader, 2, 64);
2474    nir_load_const_instr *second =
2475       nir_load_const_instr_create(b->shader, num_components - 2, 64);
2476 
2477    first->value[0] = instr->value[0];
2478    first->value[1] = instr->value[1];
2479    second->value[0] = instr->value[2];
2480    if (num_components == 4)
2481       second->value[1] = instr->value[3];
2482 
2483    nir_builder_instr_insert(b, &first->instr);
2484    nir_builder_instr_insert(b, &second->instr);
2485 
2486    nir_ssa_def *channels[4] = {
2487       nir_channel(b, &first->def, 0),
2488       nir_channel(b, &first->def, 1),
2489       nir_channel(b, &second->def, 0),
2490       num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
2491    };
2492    nir_ssa_def *new = nir_vec(b, channels, num_components);
2493    nir_ssa_def_rewrite_uses(&instr->def, nir_src_for_ssa(new));
2494    nir_instr_remove(&instr->instr);
2495 
2496    return true;
2497 }
2498 
2499 static bool
nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder * b,nir_instr * instr,void * data)2500 nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
2501                                       void *data)
2502 {
2503    switch (instr->type) {
2504    case nir_instr_type_load_const:
2505       return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
2506 
2507    case nir_instr_type_intrinsic:
2508       return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
2509    default:
2510       return false;
2511    }
2512 }
2513 
2514 static bool
nir_to_tgsi_lower_64bit_to_vec2(nir_shader * s)2515 nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
2516 {
2517    return nir_shader_instructions_pass(s,
2518                                        nir_to_tgsi_lower_64bit_to_vec2_instr,
2519                                        nir_metadata_block_index |
2520                                        nir_metadata_dominance,
2521                                        NULL);
2522 }
2523 
/*
 * Asserts that the compiler options attached to the shader request each of
 * the NIR lowerings listed below.  Called at the top of nir_to_tgsi(),
 * before any lowering pass runs.  Being assert()-based, the checks vanish
 * in builds where assert() is compiled out.
 *
 * NOTE(review): presumably these are operations the translator has no
 * direct TGSI emission for -- confirm against the ALU emission code.
 */
static void
ntt_sanity_check_driver_options(struct nir_shader *s)
{
   /* NOTE(review): UNUSED presumably silences the unused-variable warning
    * when the asserts below compile to nothing -- confirm against the macro.
    */
   UNUSED const struct nir_shader_compiler_options *options = s->options;

   assert(options->lower_extract_byte);
   assert(options->lower_extract_word);
   assert(options->lower_fdph);
   assert(options->lower_flrp64);
   assert(options->lower_fmod);
   assert(options->lower_rotate);
   assert(options->lower_vector_cmp);
}
2537 
2538 const void *
nir_to_tgsi(struct nir_shader * s,struct pipe_screen * screen)2539 nir_to_tgsi(struct nir_shader *s,
2540             struct pipe_screen *screen)
2541 {
2542    struct ntt_compile *c;
2543    const void *tgsi_tokens;
2544    bool debug = env_var_as_boolean("NIR_TO_TGSI_DEBUG", false);
2545    nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
2546    bool native_integers = screen->get_shader_param(screen,
2547                                                    pipe_shader_type_from_mesa(s->info.stage),
2548                                                    PIPE_SHADER_CAP_INTEGERS);
2549 
2550    ntt_sanity_check_driver_options(s);
2551 
2552    NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
2553               type_size, (nir_lower_io_options)0);
2554    NIR_PASS_V(s, nir_lower_regs_to_ssa);
2555 
2556    const nir_lower_tex_options lower_tex_options = {
2557       /* XXX: We could skip lowering of TXP for TEX with <=3 coord_compoennts.
2558        */
2559       .lower_txp = ~0,
2560    };
2561    NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
2562 
2563    /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
2564     * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
2565     * duplication logic we just make it so that we only see vec2s.
2566     */
2567    NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
2568    NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);
2569 
2570    if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
2571       NIR_PASS_V(s, nir_lower_ubo_vec4);
2572 
2573    ntt_optimize_nir(s, screen);
2574 
2575    NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
2576 
2577    bool progress;
2578    do {
2579       progress = false;
2580       NIR_PASS(progress, s, nir_opt_algebraic_late);
2581       if (progress) {
2582          NIR_PASS_V(s, nir_copy_prop);
2583          NIR_PASS_V(s, nir_opt_dce);
2584          NIR_PASS_V(s, nir_opt_cse);
2585       }
2586    } while (progress);
2587 
2588    if (screen->get_shader_param(screen,
2589                                 pipe_shader_type_from_mesa(s->info.stage),
2590                                 PIPE_SHADER_CAP_INTEGERS)) {
2591       NIR_PASS_V(s, nir_lower_bool_to_int32);
2592    } else {
2593       NIR_PASS_V(s, nir_lower_int_to_float);
2594       NIR_PASS_V(s, nir_lower_bool_to_float);
2595    }
2596 
2597    NIR_PASS_V(s, nir_lower_to_source_mods,
2598               nir_lower_float_source_mods |
2599               nir_lower_int_source_mods); /* no doubles */
2600    NIR_PASS_V(s, nir_convert_from_ssa, true);
2601    NIR_PASS_V(s, nir_lower_vec_to_movs);
2602 
2603    /* locals_to_regs will leave dead derefs that are good to clean up. */
2604    NIR_PASS_V(s, nir_lower_locals_to_regs);
2605    NIR_PASS_V(s, nir_opt_dce);
2606 
2607    if (debug) {
2608       fprintf(stderr, "NIR before translation to TGSI:\n");
2609       nir_print_shader(s, stderr);
2610    }
2611 
2612    c = rzalloc(NULL, struct ntt_compile);
2613    c->screen = screen;
2614 
2615    c->needs_texcoord_semantic =
2616       screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
2617    c->any_reg_as_address =
2618       screen->get_param(screen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS);
2619 
2620    c->s = s;
2621    c->native_integers = native_integers;
2622    c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
2623    ureg_setup_shader_info(c->ureg, &s->info);
2624 
2625    ntt_setup_inputs(c);
2626    ntt_setup_uniforms(c);
2627 
2628    if (s->info.stage == MESA_SHADER_FRAGMENT) {
2629       /* The draw module's polygon stipple layer doesn't respect the chosen
2630        * coordinate mode, so leave it as unspecified unless we're actually
2631        * reading the position in the shader already.  See
2632        * gl-2.1-polygon-stipple-fs on softpipe.
2633        */
2634       if ((s->info.inputs_read & VARYING_BIT_POS) ||
2635           BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
2636          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
2637                        s->info.fs.origin_upper_left ?
2638                        TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
2639                        TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
2640 
2641          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
2642                        s->info.fs.pixel_center_integer ?
2643                        TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
2644                        TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
2645       }
2646    }
2647    /* Emit the main function */
2648    nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
2649    ntt_emit_impl(c, impl);
2650    ureg_END(c->ureg);
2651 
2652    tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
2653 
2654    if (debug) {
2655       fprintf(stderr, "TGSI after translation from NIR:\n");
2656       tgsi_dump(tgsi_tokens, 0);
2657    }
2658 
2659    ureg_destroy(c->ureg);
2660 
2661    ralloc_free(c);
2662 
2663    return tgsi_tokens;
2664 }
2665 
/* Default NIR compiler options returned by
 * nir_to_tgsi_get_compiler_options().  Every lower_* flag asserted by
 * ntt_sanity_check_driver_options() is enabled here; members not named in
 * this initializer are zero-initialized.
 */
static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_fdph = true,
   .lower_flrp64 = true,
   .lower_fmod = true,
   .lower_rotate = true,
   .lower_sub = true,
   .lower_vector_cmp = true,
   .use_interpolated_input_intrinsics = true,
};
2679 
2680 /* Returns a default compiler options for drivers with only nir-to-tgsi-based
2681  * NIR support.
2682  */
2683 const void *
nir_to_tgsi_get_compiler_options(struct pipe_screen * pscreen,enum pipe_shader_ir ir,unsigned shader)2684 nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
2685                                  enum pipe_shader_ir ir,
2686                                  unsigned shader)
2687 {
2688    assert(ir == PIPE_SHADER_IR_NIR);
2689    return &nir_to_tgsi_compiler_options;
2690 }
2691