1 /*
2  * Copyright © 2011 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "util/u_math.h"
28 
29 namespace brw {
30 
vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
                                   const src_reg &src0, const src_reg &src1,
                                   const src_reg &src2)
34 {
35    this->opcode = opcode;
36    this->dst = dst;
37    this->src[0] = src0;
38    this->src[1] = src1;
39    this->src[2] = src2;
40    this->saturate = false;
41    this->force_writemask_all = false;
42    this->no_dd_clear = false;
43    this->no_dd_check = false;
44    this->writes_accumulator = false;
45    this->conditional_mod = BRW_CONDITIONAL_NONE;
46    this->predicate = BRW_PREDICATE_NONE;
47    this->predicate_inverse = false;
48    this->target = 0;
49    this->shadow_compare = false;
50    this->eot = false;
51    this->ir = NULL;
52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53    this->header_size = 0;
54    this->flag_subreg = 0;
55    this->mlen = 0;
56    this->base_mrf = 0;
57    this->offset = 0;
58    this->exec_size = 8;
59    this->group = 0;
60    this->size_written = (dst.file == BAD_FILE ?
61                          0 : this->exec_size * type_sz(dst.type));
62    this->annotation = NULL;
63 }
64 
65 vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
67 {
68    inst->ir = this->base_ir;
69    inst->annotation = this->current_annotation;
70 
71    this->instructions.push_tail(inst);
72 
73    return inst;
74 }
75 
76 vec4_instruction *
vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
                          vec4_instruction *new_inst)
79 {
80    new_inst->ir = inst->ir;
81    new_inst->annotation = inst->annotation;
82 
83    inst->insert_before(block, new_inst);
84 
85    return inst;
86 }
87 
88 vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1, const src_reg &src2)
91 {
92    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
93 }
94 
95 
96 vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1)
99 {
100    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
101 }
102 
103 vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
105 {
106    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
107 }
108 
109 vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
111 {
112    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
113 }
114 
115 vec4_instruction *
emit(enum opcode opcode)116 vec4_visitor::emit(enum opcode opcode)
117 {
118    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
119 }
120 
121 #define ALU1(op)							\
122    vec4_instruction *							\
123    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)		\
124    {									\
125       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
126    }
127 
128 #define ALU2(op)							\
129    vec4_instruction *							\
130    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
131                     const src_reg &src1)				\
132    {									\
133       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
134                                            src0, src1);                 \
135    }
136 
137 #define ALU2_ACC(op)							\
138    vec4_instruction *							\
139    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
140                     const src_reg &src1)				\
141    {									\
142       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
143                        BRW_OPCODE_##op, dst, src0, src1);		\
144       inst->writes_accumulator = true;                                  \
145       return inst;                                                      \
146    }
147 
148 #define ALU3(op)							\
149    vec4_instruction *							\
150    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
151                     const src_reg &src1, const src_reg &src2)		\
152    {									\
153       assert(devinfo->gen >= 6);						\
154       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,	\
155 					   src0, src1, src2);		\
156    }
157 
158 ALU1(NOT)
ALU1(MOV)
160 ALU1(FRC)
161 ALU1(RNDD)
162 ALU1(RNDE)
163 ALU1(RNDZ)
164 ALU1(F32TO16)
165 ALU1(F16TO32)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2_ACC(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(DP3)
173 ALU2(DP4)
174 ALU2(DPH)
175 ALU2(SHL)
176 ALU2(SHR)
177 ALU2(ASR)
178 ALU3(LRP)
179 ALU1(BFREV)
180 ALU3(BFE)
181 ALU2(BFI1)
182 ALU3(BFI2)
183 ALU1(FBH)
184 ALU1(FBL)
185 ALU1(CBIT)
186 ALU3(MAD)
187 ALU2_ACC(ADDC)
188 ALU2_ACC(SUBB)
189 ALU2(MAC)
190 ALU1(DIM)
191 
192 /** Gen4 predicated IF. */
193 vec4_instruction *
194 vec4_visitor::IF(enum brw_predicate predicate)
195 {
196    vec4_instruction *inst;
197 
198    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
199    inst->predicate = predicate;
200 
201    return inst;
202 }
203 
204 /** Gen6 IF with embedded comparison. */
205 vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1,
                 enum brw_conditional_mod condition)
208 {
209    assert(devinfo->gen == 6);
210 
211    vec4_instruction *inst;
212 
213    resolve_ud_negate(&src0);
214    resolve_ud_negate(&src1);
215 
216    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
217 					src0, src1);
218    inst->conditional_mod = condition;
219 
220    return inst;
221 }
222 
223 /**
224  * CMP: Sets the low bit of the destination channels with the result
225  * of the comparison, while the upper bits are undefined, and updates
226  * the flag register with the packed 16 bits of the result.
227  */
228 vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
                  enum brw_conditional_mod condition)
231 {
232    vec4_instruction *inst;
233 
234    /* Take the instruction:
235     *
236     * CMP null<d> src0<f> src1<f>
237     *
238     * Original gen4 does type conversion to the destination type before
239     * comparison, producing garbage results for floating point comparisons.
240     *
241     * The destination type doesn't matter on newer generations, so we set the
242     * type to match src0 so we can compact the instruction.
243     */
244    dst.type = src0.type;
245 
246    resolve_ud_negate(&src0);
247    resolve_ud_negate(&src1);
248 
249    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
250    inst->conditional_mod = condition;
251 
252    return inst;
253 }
254 
255 vec4_instruction *
vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
257 {
258    vec4_instruction *inst;
259 
260    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
261 					dst, index);
262    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
263    inst->mlen = 2;
264 
265    return inst;
266 }
267 
268 vec4_instruction *
vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
                            const src_reg &index)
271 {
272    vec4_instruction *inst;
273 
274    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
275 					dst, src, index);
276    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
277    inst->mlen = 3;
278 
279    return inst;
280 }
281 
282 src_reg
vec4_visitor::fix_3src_operand(const src_reg &src)
284 {
285    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
286     * able to use vertical stride of zero to replicate the vec4 uniform, like
287     *
288     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
289     *
290     * But you can't, since vertical stride is always four in three-source
291     * instructions. Instead, insert a MOV instruction to do the replication so
292     * that the three-source instruction can consume it.
293     */
294 
295    /* The MOV is only needed if the source is a uniform or immediate. */
296    if (src.file != UNIFORM && src.file != IMM)
297       return src;
298 
299    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
300       return src;
301 
302    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
303    expanded.type = src.type;
304    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
305    return src_reg(expanded);
306 }
307 
308 src_reg
vec4_visitor::fix_math_operand(const src_reg &src)
310 {
311    if (devinfo->gen < 6 || src.file == BAD_FILE)
312       return src;
313 
314    /* The gen6 math instruction ignores the source modifiers --
315     * swizzle, abs, negate, and at least some parts of the register
316     * region description.
317     *
318     * Rather than trying to enumerate all these cases, *always* expand the
319     * operand to a temp GRF for gen6.
320     *
321     * For gen7, keep the operand as-is, except if immediate, which gen7 still
322     * can't use.
323     */
324 
325    if (devinfo->gen == 7 && src.file != IMM)
326       return src;
327 
328    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
329    expanded.type = src.type;
330    emit(MOV(expanded, src));
331    return src_reg(expanded);
332 }
333 
334 vec4_instruction *
vec4_visitor::emit_math(enum opcode opcode,
                        const dst_reg &dst,
                        const src_reg &src0, const src_reg &src1)
338 {
339    vec4_instruction *math =
340       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
341 
342    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
343       /* MATH on Gen6 must be align1, so we can't do writemasks. */
344       math->dst = dst_reg(this, glsl_type::vec4_type);
345       math->dst.type = dst.type;
346       math = emit(MOV(dst, src_reg(math->dst)));
347    } else if (devinfo->gen < 6) {
348       math->base_mrf = 1;
349       math->mlen = src1.file == BAD_FILE ? 1 : 2;
350    }
351 
352    return math;
353 }
354 
355 void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
357 {
358    if (devinfo->gen < 7) {
359       unreachable("ir_unop_pack_half_2x16 should be lowered");
360    }
361 
362    assert(dst.type == BRW_REGISTER_TYPE_UD);
363    assert(src0.type == BRW_REGISTER_TYPE_F);
364 
365    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
366     *
367     *   Because this instruction does not have a 16-bit floating-point type,
368     *   the destination data type must be Word (W).
369     *
370     *   The destination must be DWord-aligned and specify a horizontal stride
371     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
372     *   each destination channel and the upper word is not modified.
373     *
374     * The above restriction implies that the f32to16 instruction must use
375     * align1 mode, because only in align1 mode is it possible to specify
376     * horizontal stride.  We choose here to defy the hardware docs and emit
377     * align16 instructions.
378     *
379     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
380     * instructions. I was partially successful in that the code passed all
381     * tests.  However, the code was dubiously correct and fragile, and the
382     * tests were not harsh enough to probe that frailty. Not trusting the
383     * code, I chose instead to remain in align16 mode in defiance of the hw
384     * docs).
385     *
386     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
387     * simulator, emitting a f32to16 in align16 mode with UD as destination
388     * data type is safe. The behavior differs from that specified in the PRM
389     * in that the upper word of each destination channel is cleared to 0.
390     */
391 
392    dst_reg tmp_dst(this, glsl_type::uvec2_type);
393    src_reg tmp_src(tmp_dst);
394 
395 #if 0
396    /* Verify the undocumented behavior on which the following instructions
397     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
398     * then the result of the bit-or instruction below will be incorrect.
399     *
400     * You should inspect the disasm output in order to verify that the MOV is
401     * not optimized away.
402     */
403    emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
404 #endif
405 
406    /* Give tmp the form below, where "." means untouched.
407     *
408     *     w z          y          x w z          y          x
409     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
410     *
411     * That the upper word of each write-channel be 0 is required for the
412     * following bit-shift and bit-or instructions to work. Note that this
413     * relies on the undocumented hardware behavior mentioned above.
414     */
415    tmp_dst.writemask = WRITEMASK_XY;
416    emit(F32TO16(tmp_dst, src0));
417 
418    /* Give the write-channels of dst the form:
419     *   0xhhhh0000
420     */
421    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
422    emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
423 
424    /* Finally, give the write-channels of dst the form of packHalf2x16's
425     * output:
426     *   0xhhhhllll
427     */
428    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
429    emit(OR(dst, src_reg(dst), tmp_src));
430 }
431 
432 void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
434 {
435    if (devinfo->gen < 7) {
436       unreachable("ir_unop_unpack_half_2x16 should be lowered");
437    }
438 
439    assert(dst.type == BRW_REGISTER_TYPE_F);
440    assert(src0.type == BRW_REGISTER_TYPE_UD);
441 
442    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
443     *
444     *   Because this instruction does not have a 16-bit floating-point type,
445     *   the source data type must be Word (W). The destination type must be
446     *   F (Float).
447     *
448     * To use W as the source data type, we must adjust horizontal strides,
449     * which is only possible in align1 mode. All my [chadv] attempts at
450     * emitting align1 instructions for unpackHalf2x16 failed to pass the
451     * Piglit tests, so I gave up.
452     *
453     * I've verified that, on gen7 hardware and the simulator, it is safe to
454     * emit f16to32 in align16 mode with UD as source data type.
455     */
456 
457    dst_reg tmp_dst(this, glsl_type::uvec2_type);
458    src_reg tmp_src(tmp_dst);
459 
460    tmp_dst.writemask = WRITEMASK_X;
461    emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
462 
463    tmp_dst.writemask = WRITEMASK_Y;
464    emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
465 
466    dst.writemask = WRITEMASK_XY;
467    emit(F16TO32(dst, tmp_src));
468 }
469 
470 void
vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
472 {
473    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
474     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
475     * is not suitable to generate the shift values, but we can use the packed
476     * vector float and a type-converting MOV.
477     */
478    dst_reg shift(this, glsl_type::uvec4_type);
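   /* 0x00, 0x60, 0x70 and 0x78 are the 8-bit restricted-float (VF) encodings
    * of 0.0, 8.0, 16.0 and 24.0, so the type-converting MOV into the UD
    * register below yields the shift counts <0, 8, 16, 24>.
    */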
479    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
480 
481    dst_reg shifted(this, glsl_type::uvec4_type);
482    src0.swizzle = BRW_SWIZZLE_XXXX;
483    emit(SHR(shifted, src0, src_reg(shift)));
484 
485    shifted.type = BRW_REGISTER_TYPE_UB;
486    dst_reg f(this, glsl_type::vec4_type);
487    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
488 
489    emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
490 }
491 
492 void
vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
494 {
495    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
496     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
497     * is not suitable to generate the shift values, but we can use the packed
498     * vector float and a type-converting MOV.
499     */
500    dst_reg shift(this, glsl_type::uvec4_type);
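   /* As in emit_unpack_unorm_4x8(), the VF immediate encodes <0.0, 8.0, 16.0,
    * 24.0>, which the type-converting MOV turns into the shift counts
    * <0, 8, 16, 24>.
    */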
501    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
502 
503    dst_reg shifted(this, glsl_type::uvec4_type);
504    src0.swizzle = BRW_SWIZZLE_XXXX;
505    emit(SHR(shifted, src0, src_reg(shift)));
506 
507    shifted.type = BRW_REGISTER_TYPE_B;
508    dst_reg f(this, glsl_type::vec4_type);
509    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
510 
511    dst_reg scaled(this, glsl_type::vec4_type);
512    emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
513 
514    dst_reg max(this, glsl_type::vec4_type);
515    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
516    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
517 }
518 
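/**
 * packUnorm4x8(): clamp each component to [0, 1], scale by 255, round to the
 * nearest integer, convert to unsigned, and pack the low byte of each of the
 * four components into a single 32-bit result.
 */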
519 void
vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
521 {
522    dst_reg saturated(this, glsl_type::vec4_type);
523    vec4_instruction *inst = emit(MOV(saturated, src0));
524    inst->saturate = true;
525 
526    dst_reg scaled(this, glsl_type::vec4_type);
527    emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
528 
529    dst_reg rounded(this, glsl_type::vec4_type);
530    emit(RNDE(rounded, src_reg(scaled)));
531 
532    dst_reg u(this, glsl_type::uvec4_type);
533    emit(MOV(u, src_reg(rounded)));
534 
535    src_reg bytes(u);
536    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
537 }
538 
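/**
 * packSnorm4x8(): clamp each component to [-1, 1], scale by 127, round to the
 * nearest integer, convert to signed, and pack the low byte of each of the
 * four components into a single 32-bit result.
 */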
539 void
vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
541 {
542    dst_reg max(this, glsl_type::vec4_type);
543    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
544 
545    dst_reg min(this, glsl_type::vec4_type);
546    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
547 
548    dst_reg scaled(this, glsl_type::vec4_type);
549    emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
550 
551    dst_reg rounded(this, glsl_type::vec4_type);
552    emit(RNDE(rounded, src_reg(scaled)));
553 
554    dst_reg i(this, glsl_type::ivec4_type);
555    emit(MOV(i, src_reg(rounded)));
556 
557    src_reg bytes(i);
558    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
559 }
560 
561 /*
562  * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
563  * false) elements needed to pack a type.
564  */
565 static int
type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
567 {
568    unsigned int i;
569    int size;
570 
571    switch (type->base_type) {
572    case GLSL_TYPE_UINT:
573    case GLSL_TYPE_INT:
574    case GLSL_TYPE_FLOAT:
575    case GLSL_TYPE_FLOAT16:
576    case GLSL_TYPE_BOOL:
577    case GLSL_TYPE_DOUBLE:
578    case GLSL_TYPE_UINT16:
579    case GLSL_TYPE_INT16:
580    case GLSL_TYPE_UINT8:
581    case GLSL_TYPE_INT8:
582    case GLSL_TYPE_UINT64:
583    case GLSL_TYPE_INT64:
584       if (type->is_matrix()) {
585          const glsl_type *col_type = type->column_type();
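         /* A matrix column that needs two slots (e.g. a dvec3 or dvec4)
          * counts double only when measuring in vec4 units; a single dvec4
          * slot holds it whole.
          */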
586          unsigned col_slots =
587             (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
588          return type->matrix_columns * col_slots;
589       } else {
590          /* Regardless of size of vector, it gets a vec4. This is bad
591           * packing for things like floats, but otherwise arrays become a
592           * mess.  Hopefully a later pass over the code can pack scalars
593           * down if appropriate.
594           */
595          return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
596       }
597    case GLSL_TYPE_ARRAY:
598       assert(type->length > 0);
599       return type_size_xvec4(type->fields.array, as_vec4, bindless) *
600              type->length;
601    case GLSL_TYPE_STRUCT:
602    case GLSL_TYPE_INTERFACE:
603       size = 0;
604       for (i = 0; i < type->length; i++) {
605 	 size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
606                                  bindless);
607       }
608       return size;
609    case GLSL_TYPE_SUBROUTINE:
610       return 1;
611 
612    case GLSL_TYPE_SAMPLER:
613       /* Samplers take up no register space, since they're baked in at
614        * link time.
615        */
616       return bindless ? 1 : 0;
617    case GLSL_TYPE_ATOMIC_UINT:
618       return 0;
619    case GLSL_TYPE_IMAGE:
620       return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
621    case GLSL_TYPE_VOID:
622    case GLSL_TYPE_ERROR:
623    case GLSL_TYPE_FUNCTION:
624       unreachable("not reached");
625    }
626 
627    return 0;
628 }
629 
630 /**
631  * Returns the minimum number of vec4 elements needed to pack a type.
632  *
633  * For simple types, it will return 1 (a single vec4); for matrices, the
634  * number of columns; for array and struct, the sum of the vec4_size of
635  * each of its elements; and for sampler and atomic, zero.
636  *
637  * This method is useful to calculate how much register space is needed to
638  * store a particular type.
639  */
640 extern "C" int
type_size_vec4(const struct glsl_type *type, bool bindless)
642 {
643    return type_size_xvec4(type, true, bindless);
644 }
645 
646 /**
647  * Returns the minimum number of dvec4 elements needed to pack a type.
648  *
649  * For simple types, it will return 1 (a single dvec4); for matrices, the
650  * number of columns; for array and struct, the sum of the dvec4_size of
651  * each of its elements; and for sampler and atomic, zero.
652  *
653  * This method is useful to calculate how much register space is needed to
654  * store a particular type.
655  *
656  * Measuring double-precision vertex inputs as dvec4 is required because
 * ARB_vertex_attrib_64bit states that these use the same number of locations
 * as the single-precision version. That is, two consecutive dvec4s would be
 * located in locations "x" and "x+1", not "x" and "x+2".
 *
 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
 * remap_vs_attrs() takes into account both the location and whether the
 * type fits in one or two vec4 slots.
664  */
665 extern "C" int
type_size_dvec4(const struct glsl_type *type, bool bindless)
667 {
668    return type_size_xvec4(type, false, bindless);
669 }
670 
src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
672 {
673    init();
674 
675    this->file = VGRF;
676    this->nr = v->alloc.allocate(type_size_vec4(type, false));
677 
678    if (type->is_array() || type->is_struct()) {
679       this->swizzle = BRW_SWIZZLE_NOOP;
680    } else {
681       this->swizzle = brw_swizzle_for_size(type->vector_elements);
682    }
683 
684    this->type = brw_type_for_base_type(type);
685 }
686 
src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
688 {
689    assert(size > 0);
690 
691    init();
692 
693    this->file = VGRF;
694    this->nr = v->alloc.allocate(type_size_vec4(type, false) * size);
695 
696    this->swizzle = BRW_SWIZZLE_NOOP;
697 
698    this->type = brw_type_for_base_type(type);
699 }
700 
dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
702 {
703    init();
704 
705    this->file = VGRF;
706    this->nr = v->alloc.allocate(type_size_vec4(type, false));
707 
708    if (type->is_array() || type->is_struct()) {
709       this->writemask = WRITEMASK_XYZW;
710    } else {
711       this->writemask = (1 << type->vector_elements) - 1;
712    }
713 
714    this->type = brw_type_for_base_type(type);
715 }
716 
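/**
 * Emit a SEL instruction with the given conditional mod.  With
 * BRW_CONDITIONAL_GE this computes max(src0, src1); with BRW_CONDITIONAL_L it
 * computes min(src0, src1).
 */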
717 vec4_instruction *
vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                          src_reg src0, src_reg src1)
720 {
721    vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
722    inst->conditional_mod = conditionalmod;
723    return inst;
724 }
725 
726 /**
 * Emits the instructions needed to perform a pull constant load. before_block
 * and before_inst can be NULL, in which case the instructions will be appended
 * to the end of the instruction list.
730  */
731 void
vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
                                          src_reg surf_index,
                                          src_reg offset_reg,
                                          bblock_t *before_block,
                                          vec4_instruction *before_inst)
737 {
738    assert((before_inst == NULL && before_block == NULL) ||
739           (before_inst && before_block));
740 
741    vec4_instruction *pull;
742 
743    if (devinfo->gen >= 7) {
744       dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
745 
746       grf_offset.type = offset_reg.type;
747 
748       pull = MOV(grf_offset, offset_reg);
749 
750       if (before_inst)
751          emit_before(before_block, before_inst, pull);
752       else
753          emit(pull);
754 
755       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
756                                            dst,
757                                            surf_index,
758                                            src_reg(grf_offset));
759       pull->mlen = 1;
760    } else {
761       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
762                                            dst,
763                                            surf_index,
764                                            offset_reg);
765       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
766       pull->mlen = 1;
767    }
768 
769    if (before_inst)
770       emit_before(before_block, before_inst, pull);
771    else
772       emit(pull);
773 }
774 
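/**
 * Copy the value of an arbitrary enabled channel of @src into every channel
 * of the result, yielding a value that is guaranteed to be dynamically
 * uniform.
 */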
775 src_reg
vec4_visitor::emit_uniformize(const src_reg &src)
777 {
778    const src_reg chan_index(this, glsl_type::uint_type);
779    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
780                               src.type);
781 
782    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
783       ->force_writemask_all = true;
784    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
785       ->force_writemask_all = true;
786 
787    return src_reg(dst);
788 }
789 
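/**
 * Fetch the MCS (multisample control surface) value for the given texel,
 * which a subsequent compressed-multisample texel fetch (TXF_CMS) consumes.
 */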
790 src_reg
vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
                             src_reg coordinate, src_reg surface)
793 {
794    vec4_instruction *inst =
795       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
796                                     dst_reg(this, glsl_type::uvec4_type));
797    inst->base_mrf = 2;
798    inst->src[1] = surface;
799    inst->src[2] = brw_imm_ud(0); /* sampler */
800    inst->mlen = 1;
801 
802    const int param_base = inst->base_mrf;
803 
   /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
805    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
806    int zero_mask = 0xf & ~coord_mask;
807 
808    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
809             coordinate));
810 
811    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
812             brw_imm_d(0)));
813 
814    emit(inst);
815    return src_reg(inst->dst);
816 }
817 
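/**
 * Returns true if the sampler index has to be conveyed through a message
 * header: either it is not an immediate, or it does not fit in the 4-bit
 * sampler field of the message descriptor.  Only Haswell exposes more than
 * 16 samplers per stage, so this is always false elsewhere.
 */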
818 bool
vec4_visitor::is_high_sampler(src_reg sampler)
820 {
821    if (!devinfo->is_haswell)
822       return false;
823 
824    return sampler.file != IMM || sampler.ud >= 16;
825 }
826 
827 void
vec4_visitor::emit_texture(ir_texture_opcode op,
                           dst_reg dest,
                           const glsl_type *dest_type,
                           src_reg coordinate,
                           int coord_components,
                           src_reg shadow_comparator,
                           src_reg lod, src_reg lod2,
                           src_reg sample_index,
                           uint32_t constant_offset,
                           src_reg offset_value,
                           src_reg mcs,
                           uint32_t surface,
                           src_reg surface_reg,
                           src_reg sampler_reg)
842 {
843    enum opcode opcode;
844    switch (op) {
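   /* Plain texture() maps to TXL here: vertex shaders have no implicit
    * derivatives, so sampling is done with an explicit LOD instead.
    */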
845    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
846    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
847    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
848    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
849    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
850    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
851    case ir_tg4: opcode = offset_value.file != BAD_FILE
852                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
853    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
854    case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
855    case ir_txb:
856       unreachable("TXB is not valid for vertex shaders.");
857    case ir_lod:
858       unreachable("LOD is not valid for vertex shaders.");
859    case ir_samples_identical: {
860       /* There are some challenges implementing this for vec4, and it seems
       * unlikely to be used anyway.  For now, just always return false.
862        */
863       emit(MOV(dest, brw_imm_ud(0u)));
864       return;
865    }
866    default:
867       unreachable("Unrecognized tex op");
868    }
869 
870    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
871 
872    inst->offset = constant_offset;
873 
874    /* The message header is necessary for:
875     * - Gen4 (always)
876     * - Texel offsets
877     * - Gather channel selection
878     * - Sampler indices too large to fit in a 4-bit value.
879     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
880     */
881    inst->header_size =
882       (devinfo->gen < 5 ||
883        inst->offset != 0 || op == ir_tg4 ||
884        op == ir_texture_samples ||
885        is_high_sampler(sampler_reg)) ? 1 : 0;
886    inst->base_mrf = 2;
887    inst->mlen = inst->header_size;
888    inst->dst.writemask = WRITEMASK_XYZW;
889    inst->shadow_compare = shadow_comparator.file != BAD_FILE;
890 
891    inst->src[1] = surface_reg;
892    inst->src[2] = sampler_reg;
893 
894    /* MRF for the first parameter */
895    int param_base = inst->base_mrf + inst->header_size;
896 
897    if (op == ir_txs || op == ir_query_levels) {
898       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
899       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
900       inst->mlen++;
901    } else if (op == ir_texture_samples) {
902       inst->dst.writemask = WRITEMASK_X;
903    } else {
904       /* Load the coordinate */
905       /* FINISHME: gl_clamp_mask and saturate */
906       int coord_mask = (1 << coord_components) - 1;
907       int zero_mask = 0xf & ~coord_mask;
908 
909       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
910                coordinate));
911       inst->mlen++;
912 
913       if (zero_mask != 0) {
914          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
915                   brw_imm_d(0)));
916       }
917       /* Load the shadow comparator */
918       if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
919 	 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
920 			  WRITEMASK_X),
921 		  shadow_comparator));
922 	 inst->mlen++;
923       }
924 
925       /* Load the LOD info */
926       if (op == ir_tex || op == ir_txl) {
927 	 int mrf, writemask;
928 	 if (devinfo->gen >= 5) {
929 	    mrf = param_base + 1;
930 	    if (shadow_comparator.file != BAD_FILE) {
931 	       writemask = WRITEMASK_Y;
932 	       /* mlen already incremented */
933 	    } else {
934 	       writemask = WRITEMASK_X;
935 	       inst->mlen++;
936 	    }
937 	 } else /* devinfo->gen == 4 */ {
938 	    mrf = param_base;
939 	    writemask = WRITEMASK_W;
940 	 }
941 	 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
942       } else if (op == ir_txf) {
943          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
944       } else if (op == ir_txf_ms) {
945          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
946                   sample_index));
947          if (devinfo->gen >= 7) {
948             /* MCS data is in the first channel of `mcs`, but we need to get it into
949              * the .y channel of the second vec4 of params, so replicate .x across
950              * the whole vec4 and then mask off everything except .y
951              */
952             mcs.swizzle = BRW_SWIZZLE_XXXX;
953             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
954                      mcs));
955          }
956          inst->mlen++;
957       } else if (op == ir_txd) {
958          const brw_reg_type type = lod.type;
959 
960 	 if (devinfo->gen >= 5) {
961 	    lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
962 	    lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
963 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
964 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
965 	    inst->mlen++;
966 
967 	    if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
968 	       lod.swizzle = BRW_SWIZZLE_ZZZZ;
969 	       lod2.swizzle = BRW_SWIZZLE_ZZZZ;
970 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
971 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
972 	       inst->mlen++;
973 
974                if (shadow_comparator.file != BAD_FILE) {
975                   emit(MOV(dst_reg(MRF, param_base + 2,
976                                    shadow_comparator.type, WRITEMASK_Z),
977                            shadow_comparator));
978                }
979 	    }
980 	 } else /* devinfo->gen == 4 */ {
981 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
982 	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
983 	    inst->mlen += 2;
984 	 }
985       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
986          if (shadow_comparator.file != BAD_FILE) {
987             emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
988                      shadow_comparator));
989          }
990 
991          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
992                   offset_value));
993          inst->mlen++;
994       }
995    }
996 
997    emit(inst);
998 
999    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1000     * spec requires layers.
1001     */
1002    if (op == ir_txs && devinfo->gen < 7) {
1003       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1004       emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1005                   src_reg(inst->dst), brw_imm_d(1));
1006    }
1007 
1008    if (devinfo->gen == 6 && op == ir_tg4) {
1009       emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1010    }
1011 
1012    if (op == ir_query_levels) {
1013       /* # levels is in .w */
1014       src_reg swizzled(dest);
1015       swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1016                                       SWIZZLE_W, SWIZZLE_W);
1017       emit(MOV(dest, swizzled));
1018    }
1019 }
1020 
1021 /**
1022  * Apply workarounds for Gen6 gather with UINT/SINT
1023  */
1024 void
vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1026 {
1027    if (!wa)
1028       return;
1029 
1030    int width = (wa & WA_8BIT) ? 8 : 16;
1031    dst_reg dst_f = dst;
1032    dst_f.type = BRW_REGISTER_TYPE_F;
1033 
1034    /* Convert from UNORM to UINT */
1035    emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1036    emit(MOV(dst, src_reg(dst_f)));
1037 
1038    if (wa & WA_SIGN) {
1039       /* Reinterpret the UINT value as a signed INT value by
1040        * shifting the sign bit into place, then shifting back
1041        * preserving sign.
1042        */
1043       emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1044       emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1045    }
1046 }
1047 
1048 void
vec4_visitor::gs_emit_vertex(int /* stream_id */)
1050 {
1051    unreachable("not reached");
1052 }
1053 
1054 void
vec4_visitor::gs_end_primitive()
1056 {
1057    unreachable("not reached");
1058 }
1059 
1060 void
vec4_visitor::emit_ndc_computation()
1062 {
1063    if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1064       return;
1065 
1066    /* Get the position */
1067    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1068 
1069    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1070    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1071    output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1072    output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1073 
1074    current_annotation = "NDC";
1075    dst_reg ndc_w = ndc;
1076    ndc_w.writemask = WRITEMASK_W;
1077    src_reg pos_w = pos;
1078    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1079    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1080 
1081    dst_reg ndc_xyz = ndc;
1082    ndc_xyz.writemask = WRITEMASK_XYZ;
1083 
1084    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1085 }
1086 
1087 void
vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1089 {
1090    if (devinfo->gen < 6 &&
1091        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1092         output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1093         devinfo->has_negative_rhw_bug)) {
1094       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1095       dst_reg header1_w = header1;
1096       header1_w.writemask = WRITEMASK_W;
1097 
1098       emit(MOV(header1, brw_imm_ud(0u)));
1099 
1100       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1101 	 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1102 
1103 	 current_annotation = "Point size";
1104 	 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1105 	 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1106       }
1107 
1108       if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1109          current_annotation = "Clipping flags";
1110          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1111 
1112          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1113          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1114          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1115       }
1116 
1117       if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
1118          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1119          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1120          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1121          emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1122          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1123       }
1124 
1125       /* i965 clipping workaround:
1126        * 1) Test for -ve rhw
1127        * 2) If set,
1128        *      set ndc = (0,0,0,0)
1129        *      set ucp[6] = 1
1130        *
1131        * Later, clipping will detect ucp[6] and ensure the primitive is
1132        * clipped against all fixed planes.
1133        */
1134       if (devinfo->has_negative_rhw_bug &&
1135           output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1136          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1137          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1138          emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1139          vec4_instruction *inst;
1140          inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1141          inst->predicate = BRW_PREDICATE_NORMAL;
1142          output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1143          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1144          inst->predicate = BRW_PREDICATE_NORMAL;
1145       }
1146 
1147       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1148    } else if (devinfo->gen < 6) {
1149       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1150    } else {
1151       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1152       if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1153          dst_reg reg_w = reg;
1154          reg_w.writemask = WRITEMASK_W;
1155          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1156          reg_as_src.type = reg_w.type;
1157          reg_as_src.swizzle = brw_swizzle_for_size(1);
1158          emit(MOV(reg_w, reg_as_src));
1159       }
1160       if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1161          dst_reg reg_y = reg;
1162          reg_y.writemask = WRITEMASK_Y;
1163          reg_y.type = BRW_REGISTER_TYPE_D;
1164          output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1165          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1166       }
1167       if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1168          dst_reg reg_z = reg;
1169          reg_z.writemask = WRITEMASK_Z;
1170          reg_z.type = BRW_REGISTER_TYPE_D;
1171          output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1172          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1173       }
1174    }
1175 }
1176 
1177 vec4_instruction *
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1179 {
1180    assert(varying < VARYING_SLOT_MAX);
1181 
1182    unsigned num_comps = output_num_components[varying][component];
1183    if (num_comps == 0)
1184       return NULL;
1185 
1186    assert(output_reg[varying][component].type == reg.type);
1187    current_annotation = output_reg_annotation[varying];
1188    if (output_reg[varying][component].file != BAD_FILE) {
1189       src_reg src = src_reg(output_reg[varying][component]);
1190       src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1191       reg.writemask =
1192          brw_writemask_for_component_packing(num_comps, component);
1193       return emit(MOV(reg, src));
1194    }
1195    return NULL;
1196 }
1197 
1198 void
vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1200 {
1201    reg.type = BRW_REGISTER_TYPE_F;
1202    output_reg[varying][0].type = reg.type;
1203 
1204    switch (varying) {
1205    case VARYING_SLOT_PSIZ:
1206    {
1207       /* PSIZ is always in slot 0, and is coupled with other flags. */
1208       current_annotation = "indices, point width, clip flags";
1209       emit_psiz_and_flags(reg);
1210       break;
1211    }
1212    case BRW_VARYING_SLOT_NDC:
1213       current_annotation = "NDC";
1214       if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1215          emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1216       break;
1217    case VARYING_SLOT_POS:
1218       current_annotation = "gl_Position";
1219       if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1220          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1221       break;
1222    case VARYING_SLOT_EDGE: {
1223       /* This is present when doing unfilled polygons.  We're supposed to copy
1224        * the edge flag from the user-provided vertex array
1225        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1226        * of that attribute (starts as 1.0f).  This is then used in clipping to
1227        * determine which edges should be drawn as wireframe.
1228        */
1229       current_annotation = "edge flag";
1230       int edge_attr = util_bitcount64(nir->info.inputs_read &
1231                                         BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1232       emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1233                                     glsl_type::float_type, WRITEMASK_XYZW))));
1234       break;
1235    }
1236    case BRW_VARYING_SLOT_PAD:
1237       /* No need to write to this slot */
1238       break;
1239    default:
1240       for (int i = 0; i < 4; i++) {
1241          emit_generic_urb_slot(reg, varying, i);
1242       }
1243       break;
1244    }
1245 }
1246 
1247 static unsigned
align_interleaved_urb_mlen(const struct gen_device_info *devinfo, unsigned mlen)
1249 {
1250    if (devinfo->gen >= 6) {
1251       /* URB data written (does not include the message header reg) must
1252        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1253        * section 5.4.3.2.2: URB_INTERLEAVED.
1254        *
1255        * URB entries are allocated on a multiple of 1024 bits, so an
1256        * extra 128 bits written here to make the end align to 256 is
1257        * no problem.
1258        */
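      /* mlen includes the message header register, so the URB data length is
       * mlen - 1, which is a multiple of two registers exactly when mlen is
       * odd.
       */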
1259       if ((mlen % 2) != 1)
1260 	 mlen++;
1261    }
1262 
1263    return mlen;
1264 }
1265 
1266 
1267 /**
1268  * Generates the VUE payload plus the necessary URB write instructions to
1269  * output it.
1270  *
1271  * The VUE layout is documented in Volume 2a.
1272  */
1273 void
vec4_visitor::emit_vertex()
1275 {
1276    /* MRF 0 is reserved for the debugger, so start with message header
1277     * in MRF 1.
1278     */
1279    int base_mrf = 1;
1280    int mrf = base_mrf;
1281    /* In the process of generating our URB write message contents, we
1282     * may need to unspill a register or load from an array.  Those
1283     * reads would use MRFs 14-15.
1284     */
1285    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1286 
1287    /* The following assertion verifies that max_usable_mrf causes an
1288     * even-numbered amount of URB write data, which will meet gen6's
1289     * requirements for length alignment.
1290     */
1291    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1292 
1293    /* First mrf is the g0-based message header containing URB handles and
1294     * such.
1295     */
1296    emit_urb_write_header(mrf++);
1297 
1298    if (devinfo->gen < 6) {
1299       emit_ndc_computation();
1300    }
1301 
1302    /* We may need to split this up into several URB writes, so do them in a
1303     * loop.
1304     */
1305    int slot = 0;
1306    bool complete = false;
1307    do {
1308       /* URB offset is in URB row increments, and each of our MRFs is half of
1309        * one of those, since we're doing interleaved writes.
1310        */
1311       int offset = slot / 2;
1312 
1313       mrf = base_mrf + 1;
1314       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1315          emit_urb_slot(dst_reg(MRF, mrf++),
1316                        prog_data->vue_map.slot_to_varying[slot]);
1317 
1318          /* If this was max_usable_mrf, we can't fit anything more into this
1319           * URB WRITE. Same thing if we reached the maximum length available.
1320           */
1321          if (mrf > max_usable_mrf ||
1322              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1323             slot++;
1324             break;
1325          }
1326       }
1327 
1328       complete = slot >= prog_data->vue_map.num_slots;
1329       current_annotation = "URB write";
1330       vec4_instruction *inst = emit_urb_write_opcode(complete);
1331       inst->base_mrf = base_mrf;
1332       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1333       inst->offset += offset;
1334    } while(!complete);
1335 }
1336 
1337 
1338 src_reg
vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
1341 {
1342    /* Because we store the values to scratch interleaved like our
1343     * vertex data, we need to scale the vec4 index by 2.
1344     */
1345    int message_header_scale = 2;
1346 
1347    /* Pre-gen6, the message header uses byte offsets instead of vec4
1348     * (16-byte) offset units.
1349     */
1350    if (devinfo->gen < 6)
1351       message_header_scale *= 16;
1352 
1353    if (reladdr) {
1354       /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1355        * to multiply the reladdr by 2. Notice that the reg_offset part
1356        * is in units of 16 bytes and is used to select the low/high 16-byte
1357        * chunk of a full dvec4, so we don't want to multiply that part.
1358        */
1359       src_reg index = src_reg(this, glsl_type::int_type);
1360       if (type_sz(inst->dst.type) < 8) {
1361          emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1362                                       brw_imm_d(reg_offset)));
1363          emit_before(block, inst, MUL(dst_reg(index), index,
1364                                       brw_imm_d(message_header_scale)));
1365       } else {
1366          emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1367                                       brw_imm_d(message_header_scale * 2)));
1368          emit_before(block, inst, ADD(dst_reg(index), index,
1369                                       brw_imm_d(reg_offset * message_header_scale)));
1370       }
1371       return index;
1372    } else {
1373       return brw_imm_d(reg_offset * message_header_scale);
1374    }
1375 }
1376 
1377 /**
1378  * Emits an instruction before @inst to load the value named by @orig_src
1379  * from scratch space at @base_offset to @temp.
1380  *
1381  * @base_offset is measured in 32-byte units (the size of a register).
1382  */
1383 void
vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
1387 {
1388    assert(orig_src.offset % REG_SIZE == 0);
1389    int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1390    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1391                                       reg_offset);
1392 
1393    if (type_sz(orig_src.type) < 8) {
1394       emit_before(block, inst, SCRATCH_READ(temp, index));
1395    } else {
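      /* A 64-bit value spans two registers in scratch, so read both and let
       * shuffle_64bit_data() rearrange the 32-bit halves into the layout the
       * destination expects.
       */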
1396       dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1397       dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1398       emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1399       index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1400       vec4_instruction *last_read =
1401          SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1402       emit_before(block, inst, last_read);
1403       shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1404    }
1405 }
1406 
1407 /**
1408  * Emits an instruction after @inst to store the value to be written
1409  * to @orig_dst to scratch space at @base_offset, from @temp.
1410  *
1411  * @base_offset is measured in 32-byte units (the size of a register).
1412  */
1413 void
vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                                 int base_offset)
1416 {
1417    assert(inst->dst.offset % REG_SIZE == 0);
1418    int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1419    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1420                                       reg_offset);
1421 
1422    /* Create a temporary register to store *inst's result in.
1423     *
1424     * We have to be careful in MOVing from our temporary result register in
1425     * the scratch write.  If we swizzle from channels of the temporary that
1426     * weren't initialized, it will confuse live interval analysis, which will
1427     * make spilling fail to make progress.
1428     */
1429    bool is_64bit = type_sz(inst->dst.type) == 8;
1430    const glsl_type *alloc_type =
1431       is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1432    const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1433                                        inst->dst.type),
1434                                 brw_swizzle_for_mask(inst->dst.writemask));
1435 
1436    if (!is_64bit) {
1437       dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1438 				          inst->dst.writemask));
1439       vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1440       if (inst->opcode != BRW_OPCODE_SEL)
1441          write->predicate = inst->predicate;
1442       write->ir = inst->ir;
1443       write->annotation = inst->annotation;
1444       inst->insert_after(block, write);
1445    } else {
1446       dst_reg shuffled = dst_reg(this, alloc_type);
1447       vec4_instruction *last =
1448          shuffle_64bit_data(shuffled, temp, true, block, inst);
1449       src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1450 
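      /* After shuffling, each 64-bit component occupies two 32-bit channels
       * and the dvec4 spans two registers: components X/Y land in the XY/ZW
       * channels of the first register and Z/W in the XY/ZW channels of the
       * second.  Translate the original writemask into the float writemask
       * for each of the two scratch writes.
       */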
1451       uint8_t mask = 0;
1452       if (inst->dst.writemask & WRITEMASK_X)
1453          mask |= WRITEMASK_XY;
1454       if (inst->dst.writemask & WRITEMASK_Y)
1455          mask |= WRITEMASK_ZW;
1456       if (mask) {
1457          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1458 
1459          vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1460          if (inst->opcode != BRW_OPCODE_SEL)
1461             write->predicate = inst->predicate;
1462          write->ir = inst->ir;
1463          write->annotation = inst->annotation;
1464          last->insert_after(block, write);
1465       }
1466 
1467       mask = 0;
1468       if (inst->dst.writemask & WRITEMASK_Z)
1469          mask |= WRITEMASK_XY;
1470       if (inst->dst.writemask & WRITEMASK_W)
1471          mask |= WRITEMASK_ZW;
1472       if (mask) {
1473          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1474 
1475          src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1476                                             reg_offset + 1);
1477          vec4_instruction *write =
1478             SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1479          if (inst->opcode != BRW_OPCODE_SEL)
1480             write->predicate = inst->predicate;
1481          write->ir = inst->ir;
1482          write->annotation = inst->annotation;
1483          last->insert_after(block, write);
1484       }
1485    }
1486 
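   /* Finally, redirect inst to write into the temporary allocated above; the
    * scratch write(s) emitted after it store that value to the instruction's
    * original scratch location.
    */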
1487    inst->dst.file = temp.file;
1488    inst->dst.nr = temp.nr;
1489    inst->dst.offset %= REG_SIZE;
1490    inst->dst.reladdr = NULL;
1491 }
1492 
1493 /**
1494  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1495  * adds the scratch read(s) before \p inst. The function also checks for
1496  * recursive reladdr scratch accesses, issuing the corresponding scratch
1497  * loads and rewriting reladdr references accordingly.
1498  *
1499  * \return \p src if it did not require a scratch load; otherwise, the
1500  * register holding the result of the scratch load that the caller should
1501  * use to rewrite \p src.
1502  */
1503 src_reg
1504 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1505                                    vec4_instruction *inst, src_reg src)
1506 {
1507    /* Resolve recursive reladdr scratch access by calling ourselves
1508     * with src.reladdr
1509     */
1510    if (src.reladdr)
1511       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1512                                           *src.reladdr);
1513 
1514    /* Now handle scratch access on src */
1515    if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1516       dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1517          glsl_type::dvec4_type : glsl_type::vec4_type);
1518       emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1519       src.nr = temp.nr;
1520       src.offset %= REG_SIZE;
1521       src.reladdr = NULL;
1522    }
1523 
1524    return src;
1525 }
1526 
1527 /**
1528  * We can't generally support array access in GRF space, because a
1529  * single instruction's destination can only span 2 contiguous
1530  * registers.  So, we send all GRF arrays that get variable index
1531  * access to scratch space.
1532  */
1533 void
1534 vec4_visitor::move_grf_array_access_to_scratch()
1535 {
1536    int scratch_loc[this->alloc.count];
1537    memset(scratch_loc, -1, sizeof(scratch_loc));
1538 
1539    /* First, calculate the set of virtual GRFs that need to be punted
1540     * to scratch due to having any array access on them, and where in
1541     * scratch.
1542     */
1543    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1544       if (inst->dst.file == VGRF && inst->dst.reladdr) {
1545          if (scratch_loc[inst->dst.nr] == -1) {
1546             scratch_loc[inst->dst.nr] = last_scratch;
1547             last_scratch += this->alloc.sizes[inst->dst.nr];
1548          }
1549 
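         /* The reladdr may itself be indexed through another VGRF (e.g.
          * a[b[i]]), so walk the whole reladdr chain and assign scratch
          * space to every VGRF it references as well.
          */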
1550          for (src_reg *iter = inst->dst.reladdr;
1551               iter->reladdr;
1552               iter = iter->reladdr) {
1553             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1554                scratch_loc[iter->nr] = last_scratch;
1555                last_scratch += this->alloc.sizes[iter->nr];
1556             }
1557          }
1558       }
1559 
1560       for (int i = 0; i < 3; i++) {
1561          for (src_reg *iter = &inst->src[i];
1562               iter->reladdr;
1563               iter = iter->reladdr) {
1564             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1565                scratch_loc[iter->nr] = last_scratch;
1566                last_scratch += this->alloc.sizes[iter->nr];
1567             }
1568          }
1569       }
1570    }
1571 
1572    /* Now, for anything that will be accessed through scratch, rewrite
1573     * it to load/store.  Note that this is a _safe list walk, because
1574     * we may generate a new scratch_write instruction after the one
1575     * we're processing.
1576     */
1577    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1578       /* Set up the annotation tracking for newly generated instructions. */
1579       base_ir = inst->ir;
1580       current_annotation = inst->annotation;
1581 
1582       /* First handle scratch access on the dst. Notice we have to handle
1583        * the case where the dst's reladdr also points to scratch space.
1584        */
1585       if (inst->dst.reladdr)
1586          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1587                                                    *inst->dst.reladdr);
1588 
1589       /* Now that we have handled any (possibly recursive) reladdr scratch
1590        * accesses for dst, we can safely do the scratch write for dst itself.
1591        */
1592       if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1593          emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1594 
1595       /* Now handle scratch access on any src. In this case, since inst->src[i]
1596        * already is a src_reg, we can just call emit_resolve_reladdr with
1597        * inst->src[i] and it will take care of handling scratch loads for
1598        * both src and src.reladdr (recursively).
1599        */
1600       for (int i = 0; i < 3; i++) {
1601          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1602                                              inst->src[i]);
1603       }
1604    }
1605 }
1606 
1607 /**
1608  * Emits an instruction before @inst to load the value named by @orig_src
1609  * from the pull constant buffer (surface) at @base_offset to @temp.
1610  */
1611 void
1612 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1613                                       dst_reg temp, src_reg orig_src,
1614                                       int base_offset, src_reg indirect)
1615 {
1616    assert(orig_src.offset % 16 == 0);
1617    const unsigned index = prog_data->base.binding_table.pull_constants_start;
1618 
1619    /* For 64-bit loads we need to emit two 32-bit load messages, and we also
1620     * need to shuffle the 32-bit data result into proper 64-bit data. To do
1621     * that, we emit the 32-bit loads into a temporary and shuffle the result
1622     * into the original destination.
1623     */
1624    dst_reg orig_temp = temp;
1625    bool is_64bit = type_sz(orig_src.type) == 8;
1626    if (is_64bit) {
1627       assert(type_sz(temp.type) == 8);
1628       dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1629       temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1630    }
1631 
1632    src_reg src = orig_src;
1633    for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
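      /* Pull constant offsets are expressed in 16-byte (vec4) units, so
       * convert the source's byte offset accordingly before building the
       * message offset.
       */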
1634       int reg_offset = base_offset + src.offset / 16;
1635 
1636       src_reg offset;
1637       if (indirect.file != BAD_FILE) {
1638          offset = src_reg(this, glsl_type::uint_type);
1639          emit_before(block, inst, ADD(dst_reg(offset), indirect,
1640                                       brw_imm_ud(reg_offset * 16)));
1641       } else {
1642          offset = brw_imm_d(reg_offset * 16);
1643       }
1644 
1645       emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1646                                   brw_imm_ud(index),
1647                                   offset,
1648                                   block, inst);
1649 
1650       src = byte_offset(src, 16);
1651    }
1652 
1653    if (is_64bit) {
1654       temp = retype(temp, BRW_REGISTER_TYPE_DF);
1655       shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1656    }
1657 }
1658 
1659 /**
1660  * Implements array access of uniforms by inserting a
1661  * PULL_CONSTANT_LOAD instruction.
1662  *
1663  * Unlike temporary GRF array access (where we don't support it due to
1664  * the difficulty of doing relative addressing on instruction
1665  * destinations), we could potentially do array access of uniforms
1666  * that were loaded in GRF space as push constants.  In real-world
1667  * usage we've seen, though, the arrays being used are always larger
1668  * than we could load as push constants, so just always move all
1669  * uniform array access out to a pull constant buffer.
1670  */
1671 void
1672 vec4_visitor::move_uniform_array_access_to_pull_constants()
1673 {
1674    /* The Vulkan driver doesn't support pull constants other than UBOs, so
1675     * everything has to be pushed regardless.
1676     */
1677    if (!compiler->supports_pull_constants) {
1678       split_uniform_registers();
1679       return;
1680    }
1681 
1682    /* Allocate the pull_params array */
1683    assert(stage_prog_data->nr_pull_params == 0);
1684    stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
1685                                               this->uniforms * 4);
1686 
1687    int pull_constant_loc[this->uniforms];
1688    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1689 
1690    /* First, walk through the instructions and determine which things need to
1691     * be pulled.  We mark something as needing to be pulled by setting
1692     * pull_constant_loc to 0.
1693     */
1694    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1695       /* We only care about MOV_INDIRECT of a uniform */
1696       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1697           inst->src[0].file != UNIFORM)
1698          continue;
1699 
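      /* src[0] gives the base uniform in 16-byte units and src[2] holds the
       * size in bytes of the region the indirect access may read; mark every
       * vec4 slot it can touch as needing to be pulled.
       */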
1700       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1701 
1702       for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1703          pull_constant_loc[uniform_nr + j] = 0;
1704    }
1705 
1706    /* Next, we walk the list of uniforms and assign real pull constant
1707     * locations and set their corresponding entries in pull_param.
1708     */
1709    for (int j = 0; j < this->uniforms; j++) {
1710       if (pull_constant_loc[j] < 0)
1711          continue;
1712 
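      /* Assign this uniform the next vec4 slot in the pull buffer and copy
       * its four 32-bit components into pull_param.
       */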
1713       pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1714 
1715       for (int i = 0; i < 4; i++) {
1716          stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1717             = stage_prog_data->param[j * 4 + i];
1718       }
1719    }
1720 
1721    /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1722     * instructions to actual uniform pulls.
1723     */
1724    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1725       /* We only care about MOV_INDIRECT of a uniform */
1726       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1727           inst->src[0].file != UNIFORM)
1728          continue;
1729 
1730       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1731 
1732       assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1733 
1734       emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1735                               pull_constant_loc[uniform_nr], inst->src[1]);
1736       inst->remove(block);
1737    }
1738 
1739    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1740     * no need to track them as larger-than-vec4 objects.  This will be
1741     * relied on in cutting out unused uniform vectors from push
1742     * constants.
1743     */
1744    split_uniform_registers();
1745 }
1746 
1747 void
1748 vec4_visitor::resolve_ud_negate(src_reg *reg)
1749 {
1750    if (reg->type != BRW_REGISTER_TYPE_UD ||
1751        !reg->negate)
1752       return;
1753 
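   /* Replace the negated UD source with a MOV into a temporary so that the
    * negation is applied by the MOV and later consumers see a plain,
    * unmodified register.
    */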
1754    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1755    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1756    *reg = temp;
1757 }
1758 
1759 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1760                            void *log_data,
1761                            const struct brw_sampler_prog_key_data *key_tex,
1762                            struct brw_vue_prog_data *prog_data,
1763                            const nir_shader *shader,
1764 			   void *mem_ctx,
1765                            bool no_spills,
1766                            int shader_time_index)
1767    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1768      key_tex(key_tex),
1769      prog_data(prog_data),
1770      fail_msg(NULL),
1771      first_non_payload_grf(0),
1772      live_analysis(this), performance_analysis(this),
1773      need_all_constants_in_pull_buffer(false),
1774      no_spills(no_spills),
1775      shader_time_index(shader_time_index),
1776      last_scratch(0)
1777 {
1778    this->failed = false;
1779 
1780    this->base_ir = NULL;
1781    this->current_annotation = NULL;
1782    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1783 
1784    memset(this->output_num_components, 0, sizeof(this->output_num_components));
1785 
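   /* On Gen7+ there is no real MRF file; the top of the GRF space is
    * reserved to stand in for it, so limit GRF allocation below that point.
    */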
1786    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1787 
1788    this->uniforms = 0;
1789 
1790    this->nir_locals = NULL;
1791    this->nir_ssa_values = NULL;
1792 }
1793 
1794 
1795 void
1796 vec4_visitor::fail(const char *format, ...)
1797 {
1798    va_list va;
1799    char *msg;
1800 
1801    if (failed)
1802       return;
1803 
1804    failed = true;
1805 
1806    va_start(va, format);
1807    msg = ralloc_vasprintf(mem_ctx, format, va);
1808    va_end(va);
1809    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1810 
1811    this->fail_msg = msg;
1812 
1813    if (debug_enabled) {
1814       fprintf(stderr, "%s",  msg);
1815    }
1816 }
1817 
1818 } /* namespace brw */
1819