1 /*
2  * Copyright © 2011 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 
28 namespace brw {
29 
vec4_instruction(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2)30 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
31                                    const src_reg &src0, const src_reg &src1,
32                                    const src_reg &src2)
33 {
34    this->opcode = opcode;
35    this->dst = dst;
36    this->src[0] = src0;
37    this->src[1] = src1;
38    this->src[2] = src2;
39    this->saturate = false;
40    this->force_writemask_all = false;
41    this->no_dd_clear = false;
42    this->no_dd_check = false;
43    this->writes_accumulator = false;
44    this->conditional_mod = BRW_CONDITIONAL_NONE;
45    this->predicate = BRW_PREDICATE_NONE;
46    this->predicate_inverse = false;
47    this->target = 0;
48    this->shadow_compare = false;
49    this->eot = false;
50    this->ir = NULL;
51    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52    this->header_size = 0;
53    this->flag_subreg = 0;
54    this->mlen = 0;
55    this->base_mrf = 0;
56    this->offset = 0;
57    this->exec_size = 8;
58    this->group = 0;
59    this->size_written = (dst.file == BAD_FILE ?
60                          0 : this->exec_size * type_sz(dst.type));
61    this->annotation = NULL;
62 }
63 
64 vec4_instruction *
emit(vec4_instruction * inst)65 vec4_visitor::emit(vec4_instruction *inst)
66 {
67    inst->ir = this->base_ir;
68    inst->annotation = this->current_annotation;
69 
70    this->instructions.push_tail(inst);
71 
72    return inst;
73 }
74 
75 vec4_instruction *
emit_before(bblock_t * block,vec4_instruction * inst,vec4_instruction * new_inst)76 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
77                           vec4_instruction *new_inst)
78 {
79    new_inst->ir = inst->ir;
80    new_inst->annotation = inst->annotation;
81 
82    inst->insert_before(block, new_inst);
83 
84    return inst;
85 }
86 
87 vec4_instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2)88 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
89                    const src_reg &src1, const src_reg &src2)
90 {
91    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
92 }
93 
94 
95 vec4_instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1)96 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
97                    const src_reg &src1)
98 {
99    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
100 }
101 
102 vec4_instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0)103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
104 {
105    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
106 }
107 
108 vec4_instruction *
emit(enum opcode opcode,const dst_reg & dst)109 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
110 {
111    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
112 }
113 
114 vec4_instruction *
emit(enum opcode opcode)115 vec4_visitor::emit(enum opcode opcode)
116 {
117    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
118 }
119 
/* Defines vec4_visitor::<op>(dst, src0), which allocates -- but does not
 * emit -- a BRW_OPCODE_<op> instruction with a single source.
 */
#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
   {                                                                    \
      vec4_instruction *const alu =                                     \
         new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0);     \
      return alu;                                                       \
   }
126 
/* Defines vec4_visitor::<op>(dst, src0, src1), which allocates -- but does
 * not emit -- a BRW_OPCODE_<op> instruction with two sources.
 */
#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      vec4_instruction *const alu =                                     \
         new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,            \
                                       src0, src1);                     \
      return alu;                                                       \
   }
135 
/* Same as a two-source ALU builder, but additionally flags the new
 * instruction as writing the accumulator.
 */
#define ALU2_ACC(op)                                                    \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      vec4_instruction *const acc_alu =                                 \
         new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,            \
                                       src0, src1);                     \
      acc_alu->writes_accumulator = true;                               \
      return acc_alu;                                                   \
   }
146 
/* Defines vec4_visitor::<op>(dst, src0, src1, src2), which allocates -- but
 * does not emit -- a three-source BRW_OPCODE_<op> instruction.  The builder
 * asserts that the device is gen6 or later.
 */
#define ALU3(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1, const src_reg &src2)           \
   {                                                                    \
      assert(devinfo->gen >= 6);                                        \
      vec4_instruction *const alu =                                     \
         new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,            \
                                       src0, src1, src2);               \
      return alu;                                                       \
   }
156 
157 ALU1(NOT)
ALU1(MOV)158 ALU1(MOV)
159 ALU1(FRC)
160 ALU1(RNDD)
161 ALU1(RNDE)
162 ALU1(RNDZ)
163 ALU1(F32TO16)
164 ALU1(F16TO32)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2_ACC(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(DP3)
172 ALU2(DP4)
173 ALU2(DPH)
174 ALU2(SHL)
175 ALU2(SHR)
176 ALU2(ASR)
177 ALU3(LRP)
178 ALU1(BFREV)
179 ALU3(BFE)
180 ALU2(BFI1)
181 ALU3(BFI2)
182 ALU1(FBH)
183 ALU1(FBL)
184 ALU1(CBIT)
185 ALU3(MAD)
186 ALU2_ACC(ADDC)
187 ALU2_ACC(SUBB)
188 ALU2(MAC)
189 ALU1(DIM)
190 
191 /** Gen4 predicated IF. */
192 vec4_instruction *
193 vec4_visitor::IF(enum brw_predicate predicate)
194 {
195    vec4_instruction *inst;
196 
197    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
198    inst->predicate = predicate;
199 
200    return inst;
201 }
202 
203 /** Gen6 IF with embedded comparison. */
204 vec4_instruction *
IF(src_reg src0,src_reg src1,enum brw_conditional_mod condition)205 vec4_visitor::IF(src_reg src0, src_reg src1,
206                  enum brw_conditional_mod condition)
207 {
208    assert(devinfo->gen == 6);
209 
210    vec4_instruction *inst;
211 
212    resolve_ud_negate(&src0);
213    resolve_ud_negate(&src1);
214 
215    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
216 					src0, src1);
217    inst->conditional_mod = condition;
218 
219    return inst;
220 }
221 
222 /**
223  * CMP: Sets the low bit of the destination channels with the result
224  * of the comparison, while the upper bits are undefined, and updates
225  * the flag register with the packed 16 bits of the result.
226  */
227 vec4_instruction *
CMP(dst_reg dst,src_reg src0,src_reg src1,enum brw_conditional_mod condition)228 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
229                   enum brw_conditional_mod condition)
230 {
231    vec4_instruction *inst;
232 
233    /* Take the instruction:
234     *
235     * CMP null<d> src0<f> src1<f>
236     *
237     * Original gen4 does type conversion to the destination type before
238     * comparison, producing garbage results for floating point comparisons.
239     *
240     * The destination type doesn't matter on newer generations, so we set the
241     * type to match src0 so we can compact the instruction.
242     */
243    dst.type = src0.type;
244 
245    resolve_ud_negate(&src0);
246    resolve_ud_negate(&src1);
247 
248    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249    inst->conditional_mod = condition;
250 
251    return inst;
252 }
253 
254 vec4_instruction *
SCRATCH_READ(const dst_reg & dst,const src_reg & index)255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257    vec4_instruction *inst;
258 
259    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 					dst, index);
261    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262    inst->mlen = 2;
263 
264    return inst;
265 }
266 
267 vec4_instruction *
SCRATCH_WRITE(const dst_reg & dst,const src_reg & src,const src_reg & index)268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269                             const src_reg &index)
270 {
271    vec4_instruction *inst;
272 
273    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 					dst, src, index);
275    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276    inst->mlen = 3;
277 
278    return inst;
279 }
280 
281 src_reg
fix_3src_operand(const src_reg & src)282 vec4_visitor::fix_3src_operand(const src_reg &src)
283 {
284    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285     * able to use vertical stride of zero to replicate the vec4 uniform, like
286     *
287     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288     *
289     * But you can't, since vertical stride is always four in three-source
290     * instructions. Instead, insert a MOV instruction to do the replication so
291     * that the three-source instruction can consume it.
292     */
293 
294    /* The MOV is only needed if the source is a uniform or immediate. */
295    if (src.file != UNIFORM && src.file != IMM)
296       return src;
297 
298    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299       return src;
300 
301    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302    expanded.type = src.type;
303    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
304    return src_reg(expanded);
305 }
306 
307 src_reg
resolve_source_modifiers(const src_reg & src)308 vec4_visitor::resolve_source_modifiers(const src_reg &src)
309 {
310    if (!src.abs && !src.negate)
311       return src;
312 
313    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
314    resolved.type = src.type;
315    emit(MOV(resolved, src));
316 
317    return src_reg(resolved);
318 }
319 
320 src_reg
fix_math_operand(const src_reg & src)321 vec4_visitor::fix_math_operand(const src_reg &src)
322 {
323    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
324       return src;
325 
326    /* The gen6 math instruction ignores the source modifiers --
327     * swizzle, abs, negate, and at least some parts of the register
328     * region description.
329     *
330     * Rather than trying to enumerate all these cases, *always* expand the
331     * operand to a temp GRF for gen6.
332     *
333     * For gen7, keep the operand as-is, except if immediate, which gen7 still
334     * can't use.
335     */
336 
337    if (devinfo->gen == 7 && src.file != IMM)
338       return src;
339 
340    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
341    expanded.type = src.type;
342    emit(MOV(expanded, src));
343    return src_reg(expanded);
344 }
345 
346 vec4_instruction *
emit_math(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1)347 vec4_visitor::emit_math(enum opcode opcode,
348                         const dst_reg &dst,
349                         const src_reg &src0, const src_reg &src1)
350 {
351    vec4_instruction *math =
352       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
353 
354    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
355       /* MATH on Gen6 must be align1, so we can't do writemasks. */
356       math->dst = dst_reg(this, glsl_type::vec4_type);
357       math->dst.type = dst.type;
358       math = emit(MOV(dst, src_reg(math->dst)));
359    } else if (devinfo->gen < 6) {
360       math->base_mrf = 1;
361       math->mlen = src1.file == BAD_FILE ? 1 : 2;
362    }
363 
364    return math;
365 }
366 
367 void
emit_pack_half_2x16(dst_reg dst,src_reg src0)368 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
369 {
370    if (devinfo->gen < 7) {
371       unreachable("ir_unop_pack_half_2x16 should be lowered");
372    }
373 
374    assert(dst.type == BRW_REGISTER_TYPE_UD);
375    assert(src0.type == BRW_REGISTER_TYPE_F);
376 
377    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
378     *
379     *   Because this instruction does not have a 16-bit floating-point type,
380     *   the destination data type must be Word (W).
381     *
382     *   The destination must be DWord-aligned and specify a horizontal stride
383     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
384     *   each destination channel and the upper word is not modified.
385     *
386     * The above restriction implies that the f32to16 instruction must use
387     * align1 mode, because only in align1 mode is it possible to specify
388     * horizontal stride.  We choose here to defy the hardware docs and emit
389     * align16 instructions.
390     *
391     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
392     * instructions. I was partially successful in that the code passed all
393     * tests.  However, the code was dubiously correct and fragile, and the
394     * tests were not harsh enough to probe that frailty. Not trusting the
395     * code, I chose instead to remain in align16 mode in defiance of the hw
396     * docs).
397     *
398     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
399     * simulator, emitting a f32to16 in align16 mode with UD as destination
400     * data type is safe. The behavior differs from that specified in the PRM
401     * in that the upper word of each destination channel is cleared to 0.
402     */
403 
404    dst_reg tmp_dst(this, glsl_type::uvec2_type);
405    src_reg tmp_src(tmp_dst);
406 
407 #if 0
408    /* Verify the undocumented behavior on which the following instructions
409     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
410     * then the result of the bit-or instruction below will be incorrect.
411     *
412     * You should inspect the disasm output in order to verify that the MOV is
413     * not optimized away.
414     */
415    emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
416 #endif
417 
418    /* Give tmp the form below, where "." means untouched.
419     *
420     *     w z          y          x w z          y          x
421     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
422     *
423     * That the upper word of each write-channel be 0 is required for the
424     * following bit-shift and bit-or instructions to work. Note that this
425     * relies on the undocumented hardware behavior mentioned above.
426     */
427    tmp_dst.writemask = WRITEMASK_XY;
428    emit(F32TO16(tmp_dst, src0));
429 
430    /* Give the write-channels of dst the form:
431     *   0xhhhh0000
432     */
433    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
434    emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
435 
436    /* Finally, give the write-channels of dst the form of packHalf2x16's
437     * output:
438     *   0xhhhhllll
439     */
440    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
441    emit(OR(dst, src_reg(dst), tmp_src));
442 }
443 
444 void
emit_unpack_half_2x16(dst_reg dst,src_reg src0)445 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
446 {
447    if (devinfo->gen < 7) {
448       unreachable("ir_unop_unpack_half_2x16 should be lowered");
449    }
450 
451    assert(dst.type == BRW_REGISTER_TYPE_F);
452    assert(src0.type == BRW_REGISTER_TYPE_UD);
453 
454    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
455     *
456     *   Because this instruction does not have a 16-bit floating-point type,
457     *   the source data type must be Word (W). The destination type must be
458     *   F (Float).
459     *
460     * To use W as the source data type, we must adjust horizontal strides,
461     * which is only possible in align1 mode. All my [chadv] attempts at
462     * emitting align1 instructions for unpackHalf2x16 failed to pass the
463     * Piglit tests, so I gave up.
464     *
465     * I've verified that, on gen7 hardware and the simulator, it is safe to
466     * emit f16to32 in align16 mode with UD as source data type.
467     */
468 
469    dst_reg tmp_dst(this, glsl_type::uvec2_type);
470    src_reg tmp_src(tmp_dst);
471 
472    tmp_dst.writemask = WRITEMASK_X;
473    emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
474 
475    tmp_dst.writemask = WRITEMASK_Y;
476    emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
477 
478    dst.writemask = WRITEMASK_XY;
479    emit(F16TO32(dst, tmp_src));
480 }
481 
482 void
emit_unpack_unorm_4x8(const dst_reg & dst,src_reg src0)483 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
484 {
485    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
486     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
487     * is not suitable to generate the shift values, but we can use the packed
488     * vector float and a type-converting MOV.
489     */
490    dst_reg shift(this, glsl_type::uvec4_type);
491    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
492 
493    dst_reg shifted(this, glsl_type::uvec4_type);
494    src0.swizzle = BRW_SWIZZLE_XXXX;
495    emit(SHR(shifted, src0, src_reg(shift)));
496 
497    shifted.type = BRW_REGISTER_TYPE_UB;
498    dst_reg f(this, glsl_type::vec4_type);
499    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
500 
501    emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
502 }
503 
504 void
emit_unpack_snorm_4x8(const dst_reg & dst,src_reg src0)505 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
506 {
507    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
508     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
509     * is not suitable to generate the shift values, but we can use the packed
510     * vector float and a type-converting MOV.
511     */
512    dst_reg shift(this, glsl_type::uvec4_type);
513    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
514 
515    dst_reg shifted(this, glsl_type::uvec4_type);
516    src0.swizzle = BRW_SWIZZLE_XXXX;
517    emit(SHR(shifted, src0, src_reg(shift)));
518 
519    shifted.type = BRW_REGISTER_TYPE_B;
520    dst_reg f(this, glsl_type::vec4_type);
521    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
522 
523    dst_reg scaled(this, glsl_type::vec4_type);
524    emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
525 
526    dst_reg max(this, glsl_type::vec4_type);
527    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
528    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
529 }
530 
531 void
emit_pack_unorm_4x8(const dst_reg & dst,const src_reg & src0)532 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
533 {
534    dst_reg saturated(this, glsl_type::vec4_type);
535    vec4_instruction *inst = emit(MOV(saturated, src0));
536    inst->saturate = true;
537 
538    dst_reg scaled(this, glsl_type::vec4_type);
539    emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
540 
541    dst_reg rounded(this, glsl_type::vec4_type);
542    emit(RNDE(rounded, src_reg(scaled)));
543 
544    dst_reg u(this, glsl_type::uvec4_type);
545    emit(MOV(u, src_reg(rounded)));
546 
547    src_reg bytes(u);
548    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
549 }
550 
551 void
emit_pack_snorm_4x8(const dst_reg & dst,const src_reg & src0)552 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
553 {
554    dst_reg max(this, glsl_type::vec4_type);
555    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
556 
557    dst_reg min(this, glsl_type::vec4_type);
558    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
559 
560    dst_reg scaled(this, glsl_type::vec4_type);
561    emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
562 
563    dst_reg rounded(this, glsl_type::vec4_type);
564    emit(RNDE(rounded, src_reg(scaled)));
565 
566    dst_reg i(this, glsl_type::ivec4_type);
567    emit(MOV(i, src_reg(rounded)));
568 
569    src_reg bytes(i);
570    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
571 }
572 
573 /*
574  * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
575  * false) elements needed to pack a type.
576  */
577 static int
type_size_xvec4(const struct glsl_type * type,bool as_vec4)578 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
579 {
580    unsigned int i;
581    int size;
582 
583    switch (type->base_type) {
584    case GLSL_TYPE_UINT:
585    case GLSL_TYPE_INT:
586    case GLSL_TYPE_FLOAT:
587    case GLSL_TYPE_FLOAT16:
588    case GLSL_TYPE_BOOL:
589    case GLSL_TYPE_DOUBLE:
590    case GLSL_TYPE_UINT16:
591    case GLSL_TYPE_INT16:
592    case GLSL_TYPE_UINT64:
593    case GLSL_TYPE_INT64:
594       if (type->is_matrix()) {
595          const glsl_type *col_type = type->column_type();
596          unsigned col_slots =
597             (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
598          return type->matrix_columns * col_slots;
599       } else {
600          /* Regardless of size of vector, it gets a vec4. This is bad
601           * packing for things like floats, but otherwise arrays become a
602           * mess.  Hopefully a later pass over the code can pack scalars
603           * down if appropriate.
604           */
605          return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
606       }
607    case GLSL_TYPE_ARRAY:
608       assert(type->length > 0);
609       return type_size_xvec4(type->fields.array, as_vec4) * type->length;
610    case GLSL_TYPE_STRUCT:
611       size = 0;
612       for (i = 0; i < type->length; i++) {
613 	 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
614       }
615       return size;
616    case GLSL_TYPE_SUBROUTINE:
617       return 1;
618 
619    case GLSL_TYPE_SAMPLER:
620       /* Samplers take up no register space, since they're baked in at
621        * link time.
622        */
623       return 0;
624    case GLSL_TYPE_ATOMIC_UINT:
625       return 0;
626    case GLSL_TYPE_IMAGE:
627       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
628    case GLSL_TYPE_VOID:
629    case GLSL_TYPE_ERROR:
630    case GLSL_TYPE_INTERFACE:
631    case GLSL_TYPE_FUNCTION:
632       unreachable("not reached");
633    }
634 
635    return 0;
636 }
637 
638 /**
639  * Returns the minimum number of vec4 elements needed to pack a type.
640  *
641  * For simple types, it will return 1 (a single vec4); for matrices, the
642  * number of columns; for array and struct, the sum of the vec4_size of
643  * each of its elements; and for sampler and atomic, zero.
644  *
645  * This method is useful to calculate how much register space is needed to
646  * store a particular type.
647  */
648 extern "C" int
type_size_vec4(const struct glsl_type * type)649 type_size_vec4(const struct glsl_type *type)
650 {
651    return type_size_xvec4(type, true);
652 }
653 
654 /**
655  * Returns the minimum number of dvec4 elements needed to pack a type.
656  *
657  * For simple types, it will return 1 (a single dvec4); for matrices, the
658  * number of columns; for array and struct, the sum of the dvec4_size of
659  * each of its elements; and for sampler and atomic, zero.
660  *
661  * This method is useful to calculate how much register space is needed to
662  * store a particular type.
663  *
664  * Measuring double-precision vertex inputs as dvec4 is required because
665  * ARB_vertex_attrib_64bit states that these uses the same number of locations
666  * than the single-precision version. That is, two consecutives dvec4 would be
667  * located in location "x" and location "x+1", not "x+2".
668  *
669  * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
670  * remap_vs_attrs() will take in account both the location and also if the
671  * type fits in one or two vec4 slots.
672  */
673 extern "C" int
type_size_dvec4(const struct glsl_type * type)674 type_size_dvec4(const struct glsl_type *type)
675 {
676    return type_size_xvec4(type, false);
677 }
678 
src_reg(class vec4_visitor * v,const struct glsl_type * type)679 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
680 {
681    init();
682 
683    this->file = VGRF;
684    this->nr = v->alloc.allocate(type_size_vec4(type));
685 
686    if (type->is_array() || type->is_record()) {
687       this->swizzle = BRW_SWIZZLE_NOOP;
688    } else {
689       this->swizzle = brw_swizzle_for_size(type->vector_elements);
690    }
691 
692    this->type = brw_type_for_base_type(type);
693 }
694 
src_reg(class vec4_visitor * v,const struct glsl_type * type,int size)695 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
696 {
697    assert(size > 0);
698 
699    init();
700 
701    this->file = VGRF;
702    this->nr = v->alloc.allocate(type_size_vec4(type) * size);
703 
704    this->swizzle = BRW_SWIZZLE_NOOP;
705 
706    this->type = brw_type_for_base_type(type);
707 }
708 
dst_reg(class vec4_visitor * v,const struct glsl_type * type)709 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
710 {
711    init();
712 
713    this->file = VGRF;
714    this->nr = v->alloc.allocate(type_size_vec4(type));
715 
716    if (type->is_array() || type->is_record()) {
717       this->writemask = WRITEMASK_XYZW;
718    } else {
719       this->writemask = (1 << type->vector_elements) - 1;
720    }
721 
722    this->type = brw_type_for_base_type(type);
723 }
724 
725 vec4_instruction *
emit_minmax(enum brw_conditional_mod conditionalmod,dst_reg dst,src_reg src0,src_reg src1)726 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
727                           src_reg src0, src_reg src1)
728 {
729    vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
730    inst->conditional_mod = conditionalmod;
731    return inst;
732 }
733 
734 vec4_instruction *
emit_lrp(const dst_reg & dst,const src_reg & x,const src_reg & y,const src_reg & a)735 vec4_visitor::emit_lrp(const dst_reg &dst,
736                        const src_reg &x, const src_reg &y, const src_reg &a)
737 {
738    if (devinfo->gen >= 6) {
739       /* Note that the instruction's argument order is reversed from GLSL
740        * and the IR.
741        */
742      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
743                      fix_3src_operand(x)));
744    } else {
745       /* Earlier generations don't support three source operations, so we
746        * need to emit x*(1-a) + y*a.
747        */
748       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
749       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
750       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
751       y_times_a.writemask           = dst.writemask;
752       one_minus_a.writemask         = dst.writemask;
753       x_times_one_minus_a.writemask = dst.writemask;
754 
755       emit(MUL(y_times_a, y, a));
756       emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
757       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
758       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
759    }
760 }
761 
762 /**
763  * Emits the instructions needed to perform a pull constant load. before_block
764  * and before_inst can be NULL in which case the instruction will be appended
765  * to the end of the instruction list.
766  */
767 void
emit_pull_constant_load_reg(dst_reg dst,src_reg surf_index,src_reg offset_reg,bblock_t * before_block,vec4_instruction * before_inst)768 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
769                                           src_reg surf_index,
770                                           src_reg offset_reg,
771                                           bblock_t *before_block,
772                                           vec4_instruction *before_inst)
773 {
774    assert((before_inst == NULL && before_block == NULL) ||
775           (before_inst && before_block));
776 
777    vec4_instruction *pull;
778 
779    if (devinfo->gen >= 9) {
780       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
781       src_reg header(this, glsl_type::uvec4_type, 2);
782 
783       pull = new(mem_ctx)
784          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
785                           dst_reg(header));
786 
787       if (before_inst)
788          emit_before(before_block, before_inst, pull);
789       else
790          emit(pull);
791 
792       dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
793                                  offset_reg.type);
794       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
795 
796       if (before_inst)
797          emit_before(before_block, before_inst, pull);
798       else
799          emit(pull);
800 
801       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
802                                            dst,
803                                            surf_index,
804                                            header);
805       pull->mlen = 2;
806       pull->header_size = 1;
807    } else if (devinfo->gen >= 7) {
808       dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
809 
810       grf_offset.type = offset_reg.type;
811 
812       pull = MOV(grf_offset, offset_reg);
813 
814       if (before_inst)
815          emit_before(before_block, before_inst, pull);
816       else
817          emit(pull);
818 
819       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
820                                            dst,
821                                            surf_index,
822                                            src_reg(grf_offset));
823       pull->mlen = 1;
824    } else {
825       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
826                                            dst,
827                                            surf_index,
828                                            offset_reg);
829       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
830       pull->mlen = 1;
831    }
832 
833    if (before_inst)
834       emit_before(before_block, before_inst, pull);
835    else
836       emit(pull);
837 }
838 
839 src_reg
emit_uniformize(const src_reg & src)840 vec4_visitor::emit_uniformize(const src_reg &src)
841 {
842    const src_reg chan_index(this, glsl_type::uint_type);
843    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
844                               src.type);
845 
846    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
847       ->force_writemask_all = true;
848    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
849       ->force_writemask_all = true;
850 
851    return src_reg(dst);
852 }
853 
854 src_reg
emit_mcs_fetch(const glsl_type * coordinate_type,src_reg coordinate,src_reg surface)855 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
856                              src_reg coordinate, src_reg surface)
857 {
858    vec4_instruction *inst =
859       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
860                                     dst_reg(this, glsl_type::uvec4_type));
861    inst->base_mrf = 2;
862    inst->src[1] = surface;
863    inst->src[2] = surface;
864 
865    int param_base;
866 
867    if (devinfo->gen >= 9) {
868       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
869       vec4_instruction *header_inst = new(mem_ctx)
870          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
871                           dst_reg(MRF, inst->base_mrf));
872 
873       emit(header_inst);
874 
875       inst->mlen = 2;
876       inst->header_size = 1;
877       param_base = inst->base_mrf + 1;
878    } else {
879       inst->mlen = 1;
880       param_base = inst->base_mrf;
881    }
882 
883    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
884    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
885    int zero_mask = 0xf & ~coord_mask;
886 
887    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
888             coordinate));
889 
890    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
891             brw_imm_d(0)));
892 
893    emit(inst);
894    return src_reg(inst->dst);
895 }
896 
897 bool
is_high_sampler(src_reg sampler)898 vec4_visitor::is_high_sampler(src_reg sampler)
899 {
900    if (devinfo->gen < 8 && !devinfo->is_haswell)
901       return false;
902 
903    return sampler.file != IMM || sampler.ud >= 16;
904 }
905 
906 void
emit_texture(ir_texture_opcode op,dst_reg dest,const glsl_type * dest_type,src_reg coordinate,int coord_components,src_reg shadow_comparator,src_reg lod,src_reg lod2,src_reg sample_index,uint32_t constant_offset,src_reg offset_value,src_reg mcs,uint32_t surface,src_reg surface_reg,src_reg sampler_reg)907 vec4_visitor::emit_texture(ir_texture_opcode op,
908                            dst_reg dest,
909                            const glsl_type *dest_type,
910                            src_reg coordinate,
911                            int coord_components,
912                            src_reg shadow_comparator,
913                            src_reg lod, src_reg lod2,
914                            src_reg sample_index,
915                            uint32_t constant_offset,
916                            src_reg offset_value,
917                            src_reg mcs,
918                            uint32_t surface,
919                            src_reg surface_reg,
920                            src_reg sampler_reg)
921 {
922    enum opcode opcode;
923    switch (op) {
924    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
925    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
926    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
927    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
928    case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
929                              SHADER_OPCODE_TXF_CMS); break;
930    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
931    case ir_tg4: opcode = offset_value.file != BAD_FILE
932                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
933    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
934    case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
935    case ir_txb:
936       unreachable("TXB is not valid for vertex shaders.");
937    case ir_lod:
938       unreachable("LOD is not valid for vertex shaders.");
939    case ir_samples_identical: {
940       /* There are some challenges implementing this for vec4, and it seems
941        * unlikely to be used anyway.  For now, just return false ways.
942        */
943       emit(MOV(dest, brw_imm_ud(0u)));
944       return;
945    }
946    default:
947       unreachable("Unrecognized tex op");
948    }
949 
950    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
951 
952    inst->offset = constant_offset;
953 
954    /* The message header is necessary for:
955     * - Gen4 (always)
956     * - Gen9+ for selecting SIMD4x2
957     * - Texel offsets
958     * - Gather channel selection
959     * - Sampler indices too large to fit in a 4-bit value.
960     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
961     */
962    inst->header_size =
963       (devinfo->gen < 5 || devinfo->gen >= 9 ||
964        inst->offset != 0 || op == ir_tg4 ||
965        op == ir_texture_samples ||
966        is_high_sampler(sampler_reg)) ? 1 : 0;
967    inst->base_mrf = 2;
968    inst->mlen = inst->header_size;
969    inst->dst.writemask = WRITEMASK_XYZW;
970    inst->shadow_compare = shadow_comparator.file != BAD_FILE;
971 
972    inst->src[1] = surface_reg;
973    inst->src[2] = sampler_reg;
974 
975    /* MRF for the first parameter */
976    int param_base = inst->base_mrf + inst->header_size;
977 
978    if (op == ir_txs || op == ir_query_levels) {
979       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
980       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
981       inst->mlen++;
982    } else if (op == ir_texture_samples) {
983       inst->dst.writemask = WRITEMASK_X;
984    } else {
985       /* Load the coordinate */
986       /* FINISHME: gl_clamp_mask and saturate */
987       int coord_mask = (1 << coord_components) - 1;
988       int zero_mask = 0xf & ~coord_mask;
989 
990       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
991                coordinate));
992       inst->mlen++;
993 
994       if (zero_mask != 0) {
995          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
996                   brw_imm_d(0)));
997       }
998       /* Load the shadow comparator */
999       if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1000 	 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
1001 			  WRITEMASK_X),
1002 		  shadow_comparator));
1003 	 inst->mlen++;
1004       }
1005 
1006       /* Load the LOD info */
1007       if (op == ir_tex || op == ir_txl) {
1008 	 int mrf, writemask;
1009 	 if (devinfo->gen >= 5) {
1010 	    mrf = param_base + 1;
1011 	    if (shadow_comparator.file != BAD_FILE) {
1012 	       writemask = WRITEMASK_Y;
1013 	       /* mlen already incremented */
1014 	    } else {
1015 	       writemask = WRITEMASK_X;
1016 	       inst->mlen++;
1017 	    }
1018 	 } else /* devinfo->gen == 4 */ {
1019 	    mrf = param_base;
1020 	    writemask = WRITEMASK_W;
1021 	 }
1022 	 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1023       } else if (op == ir_txf) {
1024          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1025       } else if (op == ir_txf_ms) {
1026          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1027                   sample_index));
1028          if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1029             /* MCS data is stored in the first two channels of ‘mcs’, but we
1030              * need to get it into the .y and .z channels of the second vec4
1031              * of params.
1032              */
1033             mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1034             emit(MOV(dst_reg(MRF, param_base + 1,
1035                              glsl_type::uint_type, WRITEMASK_YZ),
1036                      mcs));
1037          } else if (devinfo->gen >= 7) {
1038             /* MCS data is in the first channel of `mcs`, but we need to get it into
1039              * the .y channel of the second vec4 of params, so replicate .x across
1040              * the whole vec4 and then mask off everything except .y
1041              */
1042             mcs.swizzle = BRW_SWIZZLE_XXXX;
1043             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1044                      mcs));
1045          }
1046          inst->mlen++;
1047       } else if (op == ir_txd) {
1048          const brw_reg_type type = lod.type;
1049 
1050 	 if (devinfo->gen >= 5) {
1051 	    lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1052 	    lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1053 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1054 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1055 	    inst->mlen++;
1056 
1057 	    if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1058 	       lod.swizzle = BRW_SWIZZLE_ZZZZ;
1059 	       lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1060 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1061 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1062 	       inst->mlen++;
1063 
1064                if (shadow_comparator.file != BAD_FILE) {
1065                   emit(MOV(dst_reg(MRF, param_base + 2,
1066                                    shadow_comparator.type, WRITEMASK_Z),
1067                            shadow_comparator));
1068                }
1069 	    }
1070 	 } else /* devinfo->gen == 4 */ {
1071 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1072 	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1073 	    inst->mlen += 2;
1074 	 }
1075       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1076          if (shadow_comparator.file != BAD_FILE) {
1077             emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1078                      shadow_comparator));
1079          }
1080 
1081          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1082                   offset_value));
1083          inst->mlen++;
1084       }
1085    }
1086 
1087    emit(inst);
1088 
1089    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1090     * spec requires layers.
1091     */
1092    if (op == ir_txs && devinfo->gen < 7) {
1093       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1094       emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1095                   src_reg(inst->dst), brw_imm_d(1));
1096    }
1097 
1098    if (devinfo->gen == 6 && op == ir_tg4) {
1099       emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1100    }
1101 
1102    if (op == ir_query_levels) {
1103       /* # levels is in .w */
1104       src_reg swizzled(dest);
1105       swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1106                                       SWIZZLE_W, SWIZZLE_W);
1107       emit(MOV(dest, swizzled));
1108    }
1109 }
1110 
1111 /**
1112  * Apply workarounds for Gen6 gather with UINT/SINT
1113  */
1114 void
emit_gen6_gather_wa(uint8_t wa,dst_reg dst)1115 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1116 {
1117    if (!wa)
1118       return;
1119 
1120    int width = (wa & WA_8BIT) ? 8 : 16;
1121    dst_reg dst_f = dst;
1122    dst_f.type = BRW_REGISTER_TYPE_F;
1123 
1124    /* Convert from UNORM to UINT */
1125    emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1126    emit(MOV(dst, src_reg(dst_f)));
1127 
1128    if (wa & WA_SIGN) {
1129       /* Reinterpret the UINT value as a signed INT value by
1130        * shifting the sign bit into place, then shifting back
1131        * preserving sign.
1132        */
1133       emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1134       emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1135    }
1136 }
1137 
1138 void
gs_emit_vertex(int)1139 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1140 {
1141    unreachable("not reached");
1142 }
1143 
1144 void
gs_end_primitive()1145 vec4_visitor::gs_end_primitive()
1146 {
1147    unreachable("not reached");
1148 }
1149 
1150 void
emit_ndc_computation()1151 vec4_visitor::emit_ndc_computation()
1152 {
1153    if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1154       return;
1155 
1156    /* Get the position */
1157    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1158 
1159    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1160    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1161    output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1162    output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1163 
1164    current_annotation = "NDC";
1165    dst_reg ndc_w = ndc;
1166    ndc_w.writemask = WRITEMASK_W;
1167    src_reg pos_w = pos;
1168    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1169    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1170 
1171    dst_reg ndc_xyz = ndc;
1172    ndc_xyz.writemask = WRITEMASK_XYZ;
1173 
1174    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1175 }
1176 
1177 void
emit_psiz_and_flags(dst_reg reg)1178 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1179 {
1180    if (devinfo->gen < 6 &&
1181        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1182         output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1183         devinfo->has_negative_rhw_bug)) {
1184       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1185       dst_reg header1_w = header1;
1186       header1_w.writemask = WRITEMASK_W;
1187 
1188       emit(MOV(header1, brw_imm_ud(0u)));
1189 
1190       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1191 	 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1192 
1193 	 current_annotation = "Point size";
1194 	 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1195 	 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1196       }
1197 
1198       if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1199          current_annotation = "Clipping flags";
1200          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1201          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1202 
1203          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1204          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1205          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1206 
1207          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1208          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1209          emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1210          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1211       }
1212 
1213       /* i965 clipping workaround:
1214        * 1) Test for -ve rhw
1215        * 2) If set,
1216        *      set ndc = (0,0,0,0)
1217        *      set ucp[6] = 1
1218        *
1219        * Later, clipping will detect ucp[6] and ensure the primitive is
1220        * clipped against all fixed planes.
1221        */
1222       if (devinfo->has_negative_rhw_bug &&
1223           output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1224          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1225          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1226          emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1227          vec4_instruction *inst;
1228          inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1229          inst->predicate = BRW_PREDICATE_NORMAL;
1230          output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1231          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1232          inst->predicate = BRW_PREDICATE_NORMAL;
1233       }
1234 
1235       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1236    } else if (devinfo->gen < 6) {
1237       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1238    } else {
1239       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1240       if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1241          dst_reg reg_w = reg;
1242          reg_w.writemask = WRITEMASK_W;
1243          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1244          reg_as_src.type = reg_w.type;
1245          reg_as_src.swizzle = brw_swizzle_for_size(1);
1246          emit(MOV(reg_w, reg_as_src));
1247       }
1248       if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1249          dst_reg reg_y = reg;
1250          reg_y.writemask = WRITEMASK_Y;
1251          reg_y.type = BRW_REGISTER_TYPE_D;
1252          output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1253          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1254       }
1255       if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1256          dst_reg reg_z = reg;
1257          reg_z.writemask = WRITEMASK_Z;
1258          reg_z.type = BRW_REGISTER_TYPE_D;
1259          output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1260          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1261       }
1262    }
1263 }
1264 
1265 vec4_instruction *
emit_generic_urb_slot(dst_reg reg,int varying,int component)1266 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1267 {
1268    assert(varying < VARYING_SLOT_MAX);
1269 
1270    unsigned num_comps = output_num_components[varying][component];
1271    if (num_comps == 0)
1272       return NULL;
1273 
1274    assert(output_reg[varying][component].type == reg.type);
1275    current_annotation = output_reg_annotation[varying];
1276    if (output_reg[varying][component].file != BAD_FILE) {
1277       src_reg src = src_reg(output_reg[varying][component]);
1278       src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1279       reg.writemask =
1280          brw_writemask_for_component_packing(num_comps, component);
1281       return emit(MOV(reg, src));
1282    }
1283    return NULL;
1284 }
1285 
1286 void
emit_urb_slot(dst_reg reg,int varying)1287 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1288 {
1289    reg.type = BRW_REGISTER_TYPE_F;
1290    output_reg[varying][0].type = reg.type;
1291 
1292    switch (varying) {
1293    case VARYING_SLOT_PSIZ:
1294    {
1295       /* PSIZ is always in slot 0, and is coupled with other flags. */
1296       current_annotation = "indices, point width, clip flags";
1297       emit_psiz_and_flags(reg);
1298       break;
1299    }
1300    case BRW_VARYING_SLOT_NDC:
1301       current_annotation = "NDC";
1302       if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1303          emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1304       break;
1305    case VARYING_SLOT_POS:
1306       current_annotation = "gl_Position";
1307       if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1308          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1309       break;
1310    case VARYING_SLOT_EDGE: {
1311       /* This is present when doing unfilled polygons.  We're supposed to copy
1312        * the edge flag from the user-provided vertex array
1313        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1314        * of that attribute (starts as 1.0f).  This is then used in clipping to
1315        * determine which edges should be drawn as wireframe.
1316        */
1317       current_annotation = "edge flag";
1318       int edge_attr = _mesa_bitcount_64(nir->info.inputs_read &
1319                                         BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1320       emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1321                                     glsl_type::float_type, WRITEMASK_XYZW))));
1322       break;
1323    }
1324    case BRW_VARYING_SLOT_PAD:
1325       /* No need to write to this slot */
1326       break;
1327    default:
1328       for (int i = 0; i < 4; i++) {
1329          emit_generic_urb_slot(reg, varying, i);
1330       }
1331       break;
1332    }
1333 }
1334 
1335 static int
align_interleaved_urb_mlen(const struct gen_device_info * devinfo,int mlen)1336 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1337 {
1338    if (devinfo->gen >= 6) {
1339       /* URB data written (does not include the message header reg) must
1340        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1341        * section 5.4.3.2.2: URB_INTERLEAVED.
1342        *
1343        * URB entries are allocated on a multiple of 1024 bits, so an
1344        * extra 128 bits written here to make the end align to 256 is
1345        * no problem.
1346        */
1347       if ((mlen % 2) != 1)
1348 	 mlen++;
1349    }
1350 
1351    return mlen;
1352 }
1353 
1354 
1355 /**
1356  * Generates the VUE payload plus the necessary URB write instructions to
1357  * output it.
1358  *
1359  * The VUE layout is documented in Volume 2a.
1360  */
1361 void
emit_vertex()1362 vec4_visitor::emit_vertex()
1363 {
1364    /* MRF 0 is reserved for the debugger, so start with message header
1365     * in MRF 1.
1366     */
1367    int base_mrf = 1;
1368    int mrf = base_mrf;
1369    /* In the process of generating our URB write message contents, we
1370     * may need to unspill a register or load from an array.  Those
1371     * reads would use MRFs 14-15.
1372     */
1373    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1374 
1375    /* The following assertion verifies that max_usable_mrf causes an
1376     * even-numbered amount of URB write data, which will meet gen6's
1377     * requirements for length alignment.
1378     */
1379    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1380 
1381    /* First mrf is the g0-based message header containing URB handles and
1382     * such.
1383     */
1384    emit_urb_write_header(mrf++);
1385 
1386    if (devinfo->gen < 6) {
1387       emit_ndc_computation();
1388    }
1389 
1390    /* We may need to split this up into several URB writes, so do them in a
1391     * loop.
1392     */
1393    int slot = 0;
1394    bool complete = false;
1395    do {
1396       /* URB offset is in URB row increments, and each of our MRFs is half of
1397        * one of those, since we're doing interleaved writes.
1398        */
1399       int offset = slot / 2;
1400 
1401       mrf = base_mrf + 1;
1402       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1403          emit_urb_slot(dst_reg(MRF, mrf++),
1404                        prog_data->vue_map.slot_to_varying[slot]);
1405 
1406          /* If this was max_usable_mrf, we can't fit anything more into this
1407           * URB WRITE. Same thing if we reached the maximum length available.
1408           */
1409          if (mrf > max_usable_mrf ||
1410              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1411             slot++;
1412             break;
1413          }
1414       }
1415 
1416       complete = slot >= prog_data->vue_map.num_slots;
1417       current_annotation = "URB write";
1418       vec4_instruction *inst = emit_urb_write_opcode(complete);
1419       inst->base_mrf = base_mrf;
1420       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1421       inst->offset += offset;
1422    } while(!complete);
1423 }
1424 
1425 
1426 src_reg
get_scratch_offset(bblock_t * block,vec4_instruction * inst,src_reg * reladdr,int reg_offset)1427 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1428 				 src_reg *reladdr, int reg_offset)
1429 {
1430    /* Because we store the values to scratch interleaved like our
1431     * vertex data, we need to scale the vec4 index by 2.
1432     */
1433    int message_header_scale = 2;
1434 
1435    /* Pre-gen6, the message header uses byte offsets instead of vec4
1436     * (16-byte) offset units.
1437     */
1438    if (devinfo->gen < 6)
1439       message_header_scale *= 16;
1440 
1441    if (reladdr) {
1442       /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1443        * to multiply the reladdr by 2. Notice that the reg_offset part
1444        * is in units of 16 bytes and is used to select the low/high 16-byte
1445        * chunk of a full dvec4, so we don't want to multiply that part.
1446        */
1447       src_reg index = src_reg(this, glsl_type::int_type);
1448       if (type_sz(inst->dst.type) < 8) {
1449          emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1450                                       brw_imm_d(reg_offset)));
1451          emit_before(block, inst, MUL(dst_reg(index), index,
1452                                       brw_imm_d(message_header_scale)));
1453       } else {
1454          emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1455                                       brw_imm_d(message_header_scale * 2)));
1456          emit_before(block, inst, ADD(dst_reg(index), index,
1457                                       brw_imm_d(reg_offset * message_header_scale)));
1458       }
1459       return index;
1460    } else {
1461       return brw_imm_d(reg_offset * message_header_scale);
1462    }
1463 }
1464 
1465 /**
1466  * Emits an instruction before @inst to load the value named by @orig_src
1467  * from scratch space at @base_offset to @temp.
1468  *
1469  * @base_offset is measured in 32-byte units (the size of a register).
1470  */
1471 void
emit_scratch_read(bblock_t * block,vec4_instruction * inst,dst_reg temp,src_reg orig_src,int base_offset)1472 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1473 				dst_reg temp, src_reg orig_src,
1474 				int base_offset)
1475 {
1476    assert(orig_src.offset % REG_SIZE == 0);
1477    int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1478    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1479                                       reg_offset);
1480 
1481    if (type_sz(orig_src.type) < 8) {
1482       emit_before(block, inst, SCRATCH_READ(temp, index));
1483    } else {
1484       dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1485       dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1486       emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1487       index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1488       vec4_instruction *last_read =
1489          SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1490       emit_before(block, inst, last_read);
1491       shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1492    }
1493 }
1494 
1495 /**
1496  * Emits an instruction after @inst to store the value to be written
1497  * to @orig_dst to scratch space at @base_offset, from @temp.
1498  *
1499  * @base_offset is measured in 32-byte units (the size of a register).
1500  */
1501 void
emit_scratch_write(bblock_t * block,vec4_instruction * inst,int base_offset)1502 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1503                                  int base_offset)
1504 {
1505    assert(inst->dst.offset % REG_SIZE == 0);
1506    int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1507    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1508                                       reg_offset);
1509 
1510    /* Create a temporary register to store *inst's result in.
1511     *
1512     * We have to be careful in MOVing from our temporary result register in
1513     * the scratch write.  If we swizzle from channels of the temporary that
1514     * weren't initialized, it will confuse live interval analysis, which will
1515     * make spilling fail to make progress.
1516     */
1517    bool is_64bit = type_sz(inst->dst.type) == 8;
1518    const glsl_type *alloc_type =
1519       is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1520    const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1521                                        inst->dst.type),
1522                                 brw_swizzle_for_mask(inst->dst.writemask));
1523 
1524    if (!is_64bit) {
1525       dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1526 				          inst->dst.writemask));
1527       vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1528       if (inst->opcode != BRW_OPCODE_SEL)
1529          write->predicate = inst->predicate;
1530       write->ir = inst->ir;
1531       write->annotation = inst->annotation;
1532       inst->insert_after(block, write);
1533    } else {
1534       dst_reg shuffled = dst_reg(this, alloc_type);
1535       vec4_instruction *last =
1536          shuffle_64bit_data(shuffled, temp, true, block, inst);
1537       src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1538 
1539       uint8_t mask = 0;
1540       if (inst->dst.writemask & WRITEMASK_X)
1541          mask |= WRITEMASK_XY;
1542       if (inst->dst.writemask & WRITEMASK_Y)
1543          mask |= WRITEMASK_ZW;
1544       if (mask) {
1545          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1546 
1547          vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1548          if (inst->opcode != BRW_OPCODE_SEL)
1549             write->predicate = inst->predicate;
1550          write->ir = inst->ir;
1551          write->annotation = inst->annotation;
1552          last->insert_after(block, write);
1553       }
1554 
1555       mask = 0;
1556       if (inst->dst.writemask & WRITEMASK_Z)
1557          mask |= WRITEMASK_XY;
1558       if (inst->dst.writemask & WRITEMASK_W)
1559          mask |= WRITEMASK_ZW;
1560       if (mask) {
1561          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1562 
1563          src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1564                                             reg_offset + 1);
1565          vec4_instruction *write =
1566             SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1567          if (inst->opcode != BRW_OPCODE_SEL)
1568             write->predicate = inst->predicate;
1569          write->ir = inst->ir;
1570          write->annotation = inst->annotation;
1571          last->insert_after(block, write);
1572       }
1573    }
1574 
1575    inst->dst.file = temp.file;
1576    inst->dst.nr = temp.nr;
1577    inst->dst.offset %= REG_SIZE;
1578    inst->dst.reladdr = NULL;
1579 }
1580 
1581 /**
1582  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1583  * adds the scratch read(s) before \p inst. The function also checks for
1584  * recursive reladdr scratch accesses, issuing the corresponding scratch
1585  * loads and rewriting reladdr references accordingly.
1586  *
1587  * \return \p src if it did not require a scratch load, otherwise, the
1588  * register holding the result of the scratch load that the caller should
1589  * use to rewrite src.
1590  */
1591 src_reg
emit_resolve_reladdr(int scratch_loc[],bblock_t * block,vec4_instruction * inst,src_reg src)1592 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1593                                    vec4_instruction *inst, src_reg src)
1594 {
1595    /* Resolve recursive reladdr scratch access by calling ourselves
1596     * with src.reladdr
1597     */
1598    if (src.reladdr)
1599       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1600                                           *src.reladdr);
1601 
1602    /* Now handle scratch access on src */
1603    if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1604       dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1605          glsl_type::dvec4_type : glsl_type::vec4_type);
1606       emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1607       src.nr = temp.nr;
1608       src.offset %= REG_SIZE;
1609       src.reladdr = NULL;
1610    }
1611 
1612    return src;
1613 }
1614 
1615 /**
1616  * We can't generally support array access in GRF space, because a
1617  * single instruction's destination can only span 2 contiguous
1618  * registers.  So, we send all GRF arrays that get variable index
1619  * access to scratch space.
1620  */
1621 void
move_grf_array_access_to_scratch()1622 vec4_visitor::move_grf_array_access_to_scratch()
1623 {
1624    int scratch_loc[this->alloc.count];
1625    memset(scratch_loc, -1, sizeof(scratch_loc));
1626 
1627    /* First, calculate the set of virtual GRFs that need to be punted
1628     * to scratch due to having any array access on them, and where in
1629     * scratch.
1630     */
1631    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1632       if (inst->dst.file == VGRF && inst->dst.reladdr) {
1633          if (scratch_loc[inst->dst.nr] == -1) {
1634             scratch_loc[inst->dst.nr] = last_scratch;
1635             last_scratch += this->alloc.sizes[inst->dst.nr];
1636          }
1637 
1638          for (src_reg *iter = inst->dst.reladdr;
1639               iter->reladdr;
1640               iter = iter->reladdr) {
1641             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1642                scratch_loc[iter->nr] = last_scratch;
1643                last_scratch += this->alloc.sizes[iter->nr];
1644             }
1645          }
1646       }
1647 
1648       for (int i = 0 ; i < 3; i++) {
1649          for (src_reg *iter = &inst->src[i];
1650               iter->reladdr;
1651               iter = iter->reladdr) {
1652             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1653                scratch_loc[iter->nr] = last_scratch;
1654                last_scratch += this->alloc.sizes[iter->nr];
1655             }
1656          }
1657       }
1658    }
1659 
1660    /* Now, for anything that will be accessed through scratch, rewrite
1661     * it to load/store.  Note that this is a _safe list walk, because
1662     * we may generate a new scratch_write instruction after the one
1663     * we're processing.
1664     */
1665    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1666       /* Set up the annotation tracking for new generated instructions. */
1667       base_ir = inst->ir;
1668       current_annotation = inst->annotation;
1669 
1670       /* First handle scratch access on the dst. Notice we have to handle
1671        * the case where the dst's reladdr also points to scratch space.
1672        */
1673       if (inst->dst.reladdr)
1674          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1675                                                    *inst->dst.reladdr);
1676 
1677       /* Now that we have handled any (possibly recursive) reladdr scratch
1678        * accesses for dst we can safely do the scratch write for dst itself
1679        */
1680       if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1681          emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1682 
1683       /* Now handle scratch access on any src. In this case, since inst->src[i]
1684        * already is a src_reg, we can just call emit_resolve_reladdr with
1685        * inst->src[i] and it will take care of handling scratch loads for
1686        * both src and src.reladdr (recursively).
1687        */
1688       for (int i = 0 ; i < 3; i++) {
1689          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1690                                              inst->src[i]);
1691       }
1692    }
1693 }
1694 
1695 /**
1696  * Emits an instruction before @inst to load the value named by @orig_src
1697  * from the pull constant buffer (surface) at @base_offset to @temp.
1698  */
1699 void
emit_pull_constant_load(bblock_t * block,vec4_instruction * inst,dst_reg temp,src_reg orig_src,int base_offset,src_reg indirect)1700 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1701                                       dst_reg temp, src_reg orig_src,
1702                                       int base_offset, src_reg indirect)
1703 {
1704    assert(orig_src.offset % 16 == 0);
1705    const unsigned index = prog_data->base.binding_table.pull_constants_start;
1706 
1707    /* For 64bit loads we need to emit two 32-bit load messages and we also
1708     * we need to shuffle the 32-bit data result into proper 64-bit data. To do
1709     * that we emit the 32-bit loads into a temporary and we shuffle the result
1710     * into the original destination.
1711     */
1712    dst_reg orig_temp = temp;
1713    bool is_64bit = type_sz(orig_src.type) == 8;
1714    if (is_64bit) {
1715       assert(type_sz(temp.type) == 8);
1716       dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1717       temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1718    }
1719 
1720    src_reg src = orig_src;
1721    for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1722       int reg_offset = base_offset + src.offset / 16;
1723 
1724       src_reg offset;
1725       if (indirect.file != BAD_FILE) {
1726          offset = src_reg(this, glsl_type::uint_type);
1727          emit_before(block, inst, ADD(dst_reg(offset), indirect,
1728                                       brw_imm_ud(reg_offset * 16)));
1729       } else if (devinfo->gen >= 8) {
1730          /* Store the offset in a GRF so we can send-from-GRF. */
1731          offset = src_reg(this, glsl_type::uint_type);
1732          emit_before(block, inst, MOV(dst_reg(offset),
1733                                       brw_imm_ud(reg_offset * 16)));
1734       } else {
1735          offset = brw_imm_d(reg_offset * 16);
1736       }
1737 
1738       emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1739                                   brw_imm_ud(index),
1740                                   offset,
1741                                   block, inst);
1742 
1743       src = byte_offset(src, 16);
1744    }
1745 
1746    brw_mark_surface_used(&prog_data->base, index);
1747 
1748    if (is_64bit) {
1749       temp = retype(temp, BRW_REGISTER_TYPE_DF);
1750       shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1751    }
1752 }
1753 
1754 /**
1755  * Implements array access of uniforms by inserting a
1756  * PULL_CONSTANT_LOAD instruction.
1757  *
1758  * Unlike temporary GRF array access (where we don't support it due to
1759  * the difficulty of doing relative addressing on instruction
1760  * destinations), we could potentially do array access of uniforms
1761  * that were loaded in GRF space as push constants.  In real-world
1762  * usage we've seen, though, the arrays being used are always larger
1763  * than we could load as push constants, so just always move all
1764  * uniform array access out to a pull constant buffer.
1765  */
1766 void
move_uniform_array_access_to_pull_constants()1767 vec4_visitor::move_uniform_array_access_to_pull_constants()
1768 {
1769    /* The vulkan dirver doesn't support pull constants other than UBOs so
1770     * everything has to be pushed regardless.
1771     */
1772    if (!compiler->supports_pull_constants) {
1773       split_uniform_registers();
1774       return;
1775    }
1776 
1777    /* Allocate the pull_params array */
1778    assert(stage_prog_data->nr_pull_params == 0);
1779    stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
1780                                               this->uniforms * 4);
1781 
1782    int pull_constant_loc[this->uniforms];
1783    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1784 
1785    /* First, walk through the instructions and determine which things need to
1786     * be pulled.  We mark something as needing to be pulled by setting
1787     * pull_constant_loc to 0.
1788     */
1789    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1790       /* We only care about MOV_INDIRECT of a uniform */
1791       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1792           inst->src[0].file != UNIFORM)
1793          continue;
1794 
1795       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1796 
1797       for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1798          pull_constant_loc[uniform_nr + j] = 0;
1799    }
1800 
1801    /* Next, we walk the list of uniforms and assign real pull constant
1802     * locations and set their corresponding entries in pull_param.
1803     */
1804    for (int j = 0; j < this->uniforms; j++) {
1805       if (pull_constant_loc[j] < 0)
1806          continue;
1807 
1808       pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1809 
1810       for (int i = 0; i < 4; i++) {
1811          stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1812             = stage_prog_data->param[j * 4 + i];
1813       }
1814    }
1815 
1816    /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1817     * instructions to actual uniform pulls.
1818     */
1819    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1820       /* We only care about MOV_INDIRECT of a uniform */
1821       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1822           inst->src[0].file != UNIFORM)
1823          continue;
1824 
1825       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1826 
1827       assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1828 
1829       emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1830                               pull_constant_loc[uniform_nr], inst->src[1]);
1831       inst->remove(block);
1832    }
1833 
1834    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1835     * no need to track them as larger-than-vec4 objects.  This will be
1836     * relied on in cutting out unused uniform vectors from push
1837     * constants.
1838     */
1839    split_uniform_registers();
1840 }
1841 
1842 void
resolve_ud_negate(src_reg * reg)1843 vec4_visitor::resolve_ud_negate(src_reg *reg)
1844 {
1845    if (reg->type != BRW_REGISTER_TYPE_UD ||
1846        !reg->negate)
1847       return;
1848 
1849    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1850    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1851    *reg = temp;
1852 }
1853 
vec4_visitor(const struct brw_compiler * compiler,void * log_data,const struct brw_sampler_prog_key_data * key_tex,struct brw_vue_prog_data * prog_data,const nir_shader * shader,void * mem_ctx,bool no_spills,int shader_time_index)1854 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1855                            void *log_data,
1856                            const struct brw_sampler_prog_key_data *key_tex,
1857                            struct brw_vue_prog_data *prog_data,
1858                            const nir_shader *shader,
1859 			   void *mem_ctx,
1860                            bool no_spills,
1861                            int shader_time_index)
1862    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1863      key_tex(key_tex),
1864      prog_data(prog_data),
1865      fail_msg(NULL),
1866      first_non_payload_grf(0),
1867      need_all_constants_in_pull_buffer(false),
1868      no_spills(no_spills),
1869      shader_time_index(shader_time_index),
1870      last_scratch(0)
1871 {
1872    this->failed = false;
1873 
1874    this->base_ir = NULL;
1875    this->current_annotation = NULL;
1876    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1877 
1878    memset(this->output_num_components, 0, sizeof(this->output_num_components));
1879 
1880    this->virtual_grf_start = NULL;
1881    this->virtual_grf_end = NULL;
1882    this->live_intervals = NULL;
1883 
1884    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1885 
1886    this->uniforms = 0;
1887 }
1888 
1889 
1890 void
fail(const char * format,...)1891 vec4_visitor::fail(const char *format, ...)
1892 {
1893    va_list va;
1894    char *msg;
1895 
1896    if (failed)
1897       return;
1898 
1899    failed = true;
1900 
1901    va_start(va, format);
1902    msg = ralloc_vasprintf(mem_ctx, format, va);
1903    va_end(va);
1904    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1905 
1906    this->fail_msg = msg;
1907 
1908    if (debug_enabled) {
1909       fprintf(stderr, "%s",  msg);
1910    }
1911 }
1912 
1913 } /* namespace brw */
1914