1 /*
2  * Copyright © 2011 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/macros.h"
27 #include "program/prog_parameter.h"
28 #include "program/sampler.h"
29 }
30 
31 namespace brw {
32 
vec4_instruction::vec4_instruction(vec4_visitor *v,
				   enum opcode opcode, dst_reg dst,
				   src_reg src0, src_reg src1, src_reg src2)
36 {
37    this->opcode = opcode;
38    this->dst = dst;
39    this->src[0] = src0;
40    this->src[1] = src1;
41    this->src[2] = src2;
42    this->ir = v->base_ir;
43    this->annotation = v->current_annotation;
44 }
45 
46 vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
48 {
49    this->instructions.push_tail(inst);
50 
51    return inst;
52 }
53 
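/**
 * Inserts \p new_inst into the instruction stream immediately before
 * \p inst, copying inst's source IR pointer and annotation so that debug
 * output stays attached to the statement being expanded.
 */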
54 vec4_instruction *
vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
56 {
57    new_inst->ir = inst->ir;
58    new_inst->annotation = inst->annotation;
59 
60    inst->insert_before(new_inst);
61 
62    return inst;
63 }
64 
65 vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst,
		   src_reg src0, src_reg src1, src_reg src2)
68 {
69    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
70 					     src0, src1, src2));
71 }
72 
73 
74 vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
76 {
77    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
78 }
79 
80 vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
82 {
83    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
84 }
85 
86 vec4_instruction *
emit(enum opcode opcode)87 vec4_visitor::emit(enum opcode opcode)
88 {
89    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
90 }
91 
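/**
 * Convenience builders for the common one- and two-source ALU opcodes.
 * Note that these only construct the vec4_instruction; the caller still
 * has to pass the result to emit() to add it to the instruction stream.
 */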
92 #define ALU1(op)							\
93    vec4_instruction *							\
94    vec4_visitor::op(dst_reg dst, src_reg src0)				\
95    {									\
96       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
97 					   src0);			\
98    }
99 
100 #define ALU2(op)							\
101    vec4_instruction *							\
102    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)		\
103    {									\
104       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
105 					   src0, src1);			\
106    }
107 
108 ALU1(NOT)
ALU1(MOV)
110 ALU1(FRC)
111 ALU1(RNDD)
112 ALU1(RNDE)
113 ALU1(RNDZ)
114 ALU2(ADD)
115 ALU2(MUL)
116 ALU2(MACH)
117 ALU2(AND)
118 ALU2(OR)
119 ALU2(XOR)
120 ALU2(DP3)
121 ALU2(DP4)
122 
123 /** Gen4 predicated IF. */
124 vec4_instruction *
125 vec4_visitor::IF(uint32_t predicate)
126 {
127    vec4_instruction *inst;
128 
129    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
130    inst->predicate = predicate;
131 
132    return inst;
133 }
134 
135 /** Gen6+ IF with embedded comparison. */
136 vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
138 {
139    assert(intel->gen >= 6);
140 
141    vec4_instruction *inst;
142 
143    resolve_ud_negate(&src0);
144    resolve_ud_negate(&src1);
145 
146    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
147 					src0, src1);
148    inst->conditional_mod = condition;
149 
150    return inst;
151 }
152 
153 /**
154  * CMP: Sets the low bit of the destination channels with the result
155  * of the comparison, while the upper bits are undefined, and updates
156  * the flag register with the packed 16 bits of the result.
157  */
158 vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
160 {
161    vec4_instruction *inst;
162 
   /* original gen4 does type conversion to the destination type
    * before comparison, producing garbage results for floating
    * point comparisons.
    */
167    if (intel->gen == 4) {
168       dst.type = src0.type;
169       if (dst.file == HW_REG)
170 	 dst.fixed_hw_reg.type = dst.type;
171    }
172 
173    resolve_ud_negate(&src0);
174    resolve_ud_negate(&src1);
175 
176    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
177    inst->conditional_mod = condition;
178 
179    return inst;
180 }
181 
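/**
 * Builds a scratch-space (register spill) read, using a fixed base MRF
 * and a one-register message length.
 */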
182 vec4_instruction *
vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
184 {
185    vec4_instruction *inst;
186 
187    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
188 					dst, index);
189    inst->base_mrf = 14;
190    inst->mlen = 1;
191 
192    return inst;
193 }
194 
195 vec4_instruction *
vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
197 {
198    vec4_instruction *inst;
199 
200    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
201 					dst, src, index);
202    inst->base_mrf = 13;
203    inst->mlen = 2;
204 
205    return inst;
206 }
207 
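/**
 * Emits a dot product of \p elements components (2-4) by picking DP2,
 * DP3 or DP4 accordingly.
 */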
208 void
vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
210 {
211    static enum opcode dot_opcodes[] = {
212       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
213    };
214 
215    emit(dot_opcodes[elements - 2], dst, src0, src1);
216 }
217 
218 void
vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
220 {
221    /* The gen6 math instruction ignores the source modifiers --
222     * swizzle, abs, negate, and at least some parts of the register
223     * region description.
224     *
225     * While it would seem that this MOV could be avoided at this point
226     * in the case that the swizzle is matched up with the destination
227     * writemask, note that uniform packing and register allocation
228     * could rearrange our swizzle, so let's leave this matter up to
229     * copy propagation later.
230     */
231    src_reg temp_src = src_reg(this, glsl_type::vec4_type);
232    emit(MOV(dst_reg(temp_src), src));
233 
234    if (dst.writemask != WRITEMASK_XYZW) {
235       /* The gen6 math instruction must be align1, so we can't do
236        * writemasks.
237        */
238       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
239 
240       emit(opcode, temp_dst, temp_src);
241 
242       emit(MOV(dst, src_reg(temp_dst)));
243    } else {
244       emit(opcode, dst, temp_src);
245    }
246 }
247 
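/**
 * Pre-gen6, math is sent as a message, so all we record here is the
 * base MRF and message length for the generator to use.
 */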
248 void
vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
250 {
251    vec4_instruction *inst = emit(opcode, dst, src);
252    inst->base_mrf = 1;
253    inst->mlen = 1;
254 }
255 
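/**
 * Single-operand math dispatch: gen7+ can use the math instruction
 * directly, gen6 needs the source and writemask workarounds above, and
 * gen4/5 go through the message-based path.
 */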
256 void
vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
258 {
259    switch (opcode) {
260    case SHADER_OPCODE_RCP:
261    case SHADER_OPCODE_RSQ:
262    case SHADER_OPCODE_SQRT:
263    case SHADER_OPCODE_EXP2:
264    case SHADER_OPCODE_LOG2:
265    case SHADER_OPCODE_SIN:
266    case SHADER_OPCODE_COS:
267       break;
268    default:
269       assert(!"not reached: bad math opcode");
270       return;
271    }
272 
273    if (intel->gen >= 7) {
274       emit(opcode, dst, src);
275    } else if (intel->gen == 6) {
276       return emit_math1_gen6(opcode, dst, src);
277    } else {
278       return emit_math1_gen4(opcode, dst, src);
279    }
280 }
281 
282 void
vec4_visitor::emit_math2_gen6(enum opcode opcode,
			      dst_reg dst, src_reg src0, src_reg src1)
285 {
286    src_reg expanded;
287 
288    /* The gen6 math instruction ignores the source modifiers --
289     * swizzle, abs, negate, and at least some parts of the register
290     * region description.  Move the sources to temporaries to make it
291     * generally work.
292     */
293 
294    expanded = src_reg(this, glsl_type::vec4_type);
295    expanded.type = src0.type;
296    emit(MOV(dst_reg(expanded), src0));
297    src0 = expanded;
298 
299    expanded = src_reg(this, glsl_type::vec4_type);
300    expanded.type = src1.type;
301    emit(MOV(dst_reg(expanded), src1));
302    src1 = expanded;
303 
304    if (dst.writemask != WRITEMASK_XYZW) {
305       /* The gen6 math instruction must be align1, so we can't do
306        * writemasks.
307        */
308       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
309       temp_dst.type = dst.type;
310 
311       emit(opcode, temp_dst, src0, src1);
312 
313       emit(MOV(dst, src_reg(temp_dst)));
314    } else {
315       emit(opcode, dst, src0, src1);
316    }
317 }
318 
319 void
vec4_visitor::emit_math2_gen4(enum opcode opcode,
			      dst_reg dst, src_reg src0, src_reg src1)
322 {
323    vec4_instruction *inst = emit(opcode, dst, src0, src1);
324    inst->base_mrf = 1;
325    inst->mlen = 2;
326 }
327 
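/**
 * Two-operand math dispatch, mirroring the single-operand case above:
 * direct on gen7+, temporaries and an align1-safe writemask on gen6,
 * and a two-register message on gen4/5.
 */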
328 void
vec4_visitor::emit_math(enum opcode opcode,
			dst_reg dst, src_reg src0, src_reg src1)
331 {
332    switch (opcode) {
333    case SHADER_OPCODE_POW:
334    case SHADER_OPCODE_INT_QUOTIENT:
335    case SHADER_OPCODE_INT_REMAINDER:
336       break;
337    default:
338       assert(!"not reached: unsupported binary math opcode");
339       return;
340    }
341 
342    if (intel->gen >= 7) {
343       emit(opcode, dst, src0, src1);
344    } else if (intel->gen == 6) {
345       return emit_math2_gen6(opcode, dst, src0, src1);
346    } else {
347       return emit_math2_gen4(opcode, dst, src0, src1);
348    }
349 }
350 
351 void
vec4_visitor::visit_instructions(const exec_list *list)
353 {
354    foreach_list(node, list) {
355       ir_instruction *ir = (ir_instruction *)node;
356 
357       base_ir = ir;
358       ir->accept(this);
359    }
360 }
361 
362 
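/**
 * Returns the size of a type in vec4 registers: one per scalar or
 * vector, one per matrix column, and the recursive sum for arrays and
 * structs.  A mat3, for example, takes three registers.
 */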
363 static int
type_size(const struct glsl_type *type)
365 {
366    unsigned int i;
367    int size;
368 
369    switch (type->base_type) {
370    case GLSL_TYPE_UINT:
371    case GLSL_TYPE_INT:
372    case GLSL_TYPE_FLOAT:
373    case GLSL_TYPE_BOOL:
374       if (type->is_matrix()) {
375 	 return type->matrix_columns;
376       } else {
	 /* Regardless of the size of the vector, it gets a vec4.  This is
	  * bad packing for things like floats, but otherwise arrays become
	  * a mess.  Hopefully a later pass over the code can pack scalars
	  * down if appropriate.
	  */
382 	 return 1;
383       }
384    case GLSL_TYPE_ARRAY:
385       assert(type->length > 0);
386       return type_size(type->fields.array) * type->length;
387    case GLSL_TYPE_STRUCT:
388       size = 0;
389       for (i = 0; i < type->length; i++) {
390 	 size += type_size(type->fields.structure[i].type);
391       }
392       return size;
393    case GLSL_TYPE_SAMPLER:
394       /* Samplers take up one slot in UNIFORMS[], but they're baked in
395        * at link time.
396        */
397       return 1;
398    default:
399       assert(0);
400       return 0;
401    }
402 }
403 
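/**
 * Allocates a virtual GRF of the given size in registers, records its
 * base register offset, grows the tracking arrays as needed, and
 * returns the new virtual GRF index.
 */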
404 int
vec4_visitor::virtual_grf_alloc(int size)
406 {
407    if (virtual_grf_array_size <= virtual_grf_count) {
408       if (virtual_grf_array_size == 0)
409 	 virtual_grf_array_size = 16;
410       else
411 	 virtual_grf_array_size *= 2;
412       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
413 				   virtual_grf_array_size);
414       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
415 				     virtual_grf_array_size);
416    }
417    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
418    virtual_grf_reg_count += size;
419    virtual_grf_sizes[virtual_grf_count] = size;
420    return virtual_grf_count++;
421 }
422 
src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
424 {
425    init();
426 
427    this->file = GRF;
428    this->reg = v->virtual_grf_alloc(type_size(type));
429 
430    if (type->is_array() || type->is_record()) {
431       this->swizzle = BRW_SWIZZLE_NOOP;
432    } else {
433       this->swizzle = swizzle_for_size(type->vector_elements);
434    }
435 
436    this->type = brw_type_for_base_type(type);
437 }
438 
dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
440 {
441    init();
442 
443    this->file = GRF;
444    this->reg = v->virtual_grf_alloc(type_size(type));
445 
446    if (type->is_array() || type->is_record()) {
447       this->writemask = WRITEMASK_XYZW;
448    } else {
449       this->writemask = (1 << type->vector_elements) - 1;
450    }
451 
452    this->type = brw_type_for_base_type(type);
453 }
454 
/* Our support for uniforms is piggy-backed on the struct
 * gl_vertex_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
460 int
vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
462 {
463    unsigned int offset = 0;
464    float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
465 
466    if (type->is_matrix()) {
467       const glsl_type *column = type->column_type();
468 
469       for (unsigned int i = 0; i < type->matrix_columns; i++) {
470 	 offset += setup_uniform_values(loc + offset, column);
471       }
472 
473       return offset;
474    }
475 
476    switch (type->base_type) {
477    case GLSL_TYPE_FLOAT:
478    case GLSL_TYPE_UINT:
479    case GLSL_TYPE_INT:
480    case GLSL_TYPE_BOOL:
481       for (unsigned int i = 0; i < type->vector_elements; i++) {
482 	 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
483       }
484 
485       /* Set up pad elements to get things aligned to a vec4 boundary. */
486       for (unsigned int i = type->vector_elements; i < 4; i++) {
487 	 static float zero = 0;
488 
489 	 c->prog_data.param[this->uniforms * 4 + i] = &zero;
490       }
491 
492       /* Track the size of this uniform vector, for future packing of
493        * uniforms.
494        */
495       this->uniform_vector_size[this->uniforms] = type->vector_elements;
496       this->uniforms++;
497 
498       return 1;
499 
500    case GLSL_TYPE_STRUCT:
501       for (unsigned int i = 0; i < type->length; i++) {
502 	 offset += setup_uniform_values(loc + offset,
503 					type->fields.structure[i].type);
504       }
505       return offset;
506 
507    case GLSL_TYPE_ARRAY:
508       for (unsigned int i = 0; i < type->length; i++) {
509 	 offset += setup_uniform_values(loc + offset, type->fields.array);
510       }
511       return offset;
512 
513    case GLSL_TYPE_SAMPLER:
514       /* The sampler takes up a slot, but we don't use any values from it. */
515       return 1;
516 
517    default:
518       assert(!"not reached");
519       return 0;
520    }
521 }
522 
523 void
vec4_visitor::setup_uniform_clipplane_values()
525 {
526    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
527 
528    /* Pre-Gen6, we compact clip planes.  For example, if the user
529     * enables just clip planes 0, 1, and 3, we will enable clip planes
530     * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
531     * plane 2.  This simplifies the implementation of the Gen6 clip
532     * thread.
533     *
534     * In Gen6 and later, we don't compact clip planes, because this
535     * simplifies the implementation of gl_ClipDistance.
536     */
537    int compacted_clipplane_index = 0;
538    for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
539       if (intel->gen < 6 &&
540           !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
541          continue;
542       }
543       this->uniform_vector_size[this->uniforms] = 4;
544       this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
545       this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
546       for (int j = 0; j < 4; ++j) {
547          c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
548       }
549       ++compacted_clipplane_index;
550       ++this->uniforms;
551    }
552 }
553 
554 /* Our support for builtin uniforms is even scarier than non-builtin.
555  * It sits on top of the PROG_STATE_VAR parameters that are
556  * automatically updated from GL context state.
557  */
558 void
vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
560 {
561    const ir_state_slot *const slots = ir->state_slots;
562    assert(ir->state_slots != NULL);
563 
564    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
565       /* This state reference has already been setup by ir_to_mesa,
566        * but we'll get the same index back here.  We can reference
567        * ParameterValues directly, since unlike brw_fs.cpp, we never
568        * add new state references during compile.
569        */
570       int index = _mesa_add_state_reference(this->vp->Base.Parameters,
571 					    (gl_state_index *)slots[i].tokens);
572       float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
573 
574       this->uniform_vector_size[this->uniforms] = 0;
575       /* Add each of the unique swizzled channels of the element.
576        * This will end up matching the size of the glsl_type of this field.
577        */
578       int last_swiz = -1;
579       for (unsigned int j = 0; j < 4; j++) {
580 	 int swiz = GET_SWZ(slots[i].swizzle, j);
581 	 last_swiz = swiz;
582 
583 	 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
584 	 if (swiz <= last_swiz)
585 	    this->uniform_vector_size[this->uniforms]++;
586       }
587       this->uniforms++;
588    }
589 }
590 
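/**
 * Returns the register previously allocated for an ir_variable by
 * visit(ir_variable *), or NULL if it hasn't been assigned storage yet.
 */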
591 dst_reg *
vec4_visitor::variable_storage(ir_variable *var)
593 {
594    return (dst_reg *)hash_table_find(this->variable_ht, var);
595 }
596 
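/**
 * Evaluates a boolean condition and emits whatever comparison or logic
 * op is needed to set the flag register, returning in *predicate the
 * predication the caller should put on its conditional instructions.
 */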
597 void
vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
599 {
600    ir_expression *expr = ir->as_expression();
601 
602    *predicate = BRW_PREDICATE_NORMAL;
603 
604    if (expr) {
605       src_reg op[2];
606       vec4_instruction *inst;
607 
608       assert(expr->get_num_operands() <= 2);
609       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
610 	 expr->operands[i]->accept(this);
611 	 op[i] = this->result;
612 
613 	 resolve_ud_negate(&op[i]);
614       }
615 
616       switch (expr->operation) {
617       case ir_unop_logic_not:
618 	 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
619 	 inst->conditional_mod = BRW_CONDITIONAL_Z;
620 	 break;
621 
622       case ir_binop_logic_xor:
623 	 inst = emit(XOR(dst_null_d(), op[0], op[1]));
624 	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
625 	 break;
626 
627       case ir_binop_logic_or:
628 	 inst = emit(OR(dst_null_d(), op[0], op[1]));
629 	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
630 	 break;
631 
632       case ir_binop_logic_and:
633 	 inst = emit(AND(dst_null_d(), op[0], op[1]));
634 	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
635 	 break;
636 
637       case ir_unop_f2b:
638 	 if (intel->gen >= 6) {
639 	    emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
640 	 } else {
641 	    inst = emit(MOV(dst_null_f(), op[0]));
642 	    inst->conditional_mod = BRW_CONDITIONAL_NZ;
643 	 }
644 	 break;
645 
646       case ir_unop_i2b:
647 	 if (intel->gen >= 6) {
648 	    emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
649 	 } else {
650 	    inst = emit(MOV(dst_null_d(), op[0]));
651 	    inst->conditional_mod = BRW_CONDITIONAL_NZ;
652 	 }
653 	 break;
654 
655       case ir_binop_all_equal:
656 	 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
657 	 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
658 	 break;
659 
660       case ir_binop_any_nequal:
661 	 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
662 	 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
663 	 break;
664 
665       case ir_unop_any:
666 	 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
667 	 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
668 	 break;
669 
670       case ir_binop_greater:
671       case ir_binop_gequal:
672       case ir_binop_less:
673       case ir_binop_lequal:
674       case ir_binop_equal:
675       case ir_binop_nequal:
676 	 emit(CMP(dst_null_d(), op[0], op[1],
677 		  brw_conditional_for_comparison(expr->operation)));
678 	 break;
679 
680       default:
681 	 assert(!"not reached");
682 	 break;
683       }
684       return;
685    }
686 
687    ir->accept(this);
688 
689    resolve_ud_negate(&this->result);
690 
691    if (intel->gen >= 6) {
692       vec4_instruction *inst = emit(AND(dst_null_d(),
693 					this->result, src_reg(1)));
694       inst->conditional_mod = BRW_CONDITIONAL_NZ;
695    } else {
696       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
697       inst->conditional_mod = BRW_CONDITIONAL_NZ;
698    }
699 }
700 
701 /**
702  * Emit a gen6 IF statement with the comparison folded into the IF
703  * instruction.
704  */
705 void
vec4_visitor::emit_if_gen6(ir_if *ir)
707 {
708    ir_expression *expr = ir->condition->as_expression();
709 
710    if (expr) {
711       src_reg op[2];
712       dst_reg temp;
713 
714       assert(expr->get_num_operands() <= 2);
715       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
716 	 expr->operands[i]->accept(this);
717 	 op[i] = this->result;
718       }
719 
720       switch (expr->operation) {
721       case ir_unop_logic_not:
722 	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
723 	 return;
724 
725       case ir_binop_logic_xor:
726 	 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
727 	 return;
728 
729       case ir_binop_logic_or:
730 	 temp = dst_reg(this, glsl_type::bool_type);
731 	 emit(OR(temp, op[0], op[1]));
732 	 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
733 	 return;
734 
735       case ir_binop_logic_and:
736 	 temp = dst_reg(this, glsl_type::bool_type);
737 	 emit(AND(temp, op[0], op[1]));
738 	 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
739 	 return;
740 
741       case ir_unop_f2b:
742 	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
743 	 return;
744 
745       case ir_unop_i2b:
746 	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
747 	 return;
748 
749       case ir_binop_greater:
750       case ir_binop_gequal:
751       case ir_binop_less:
752       case ir_binop_lequal:
753       case ir_binop_equal:
754       case ir_binop_nequal:
755 	 emit(IF(op[0], op[1],
756 		 brw_conditional_for_comparison(expr->operation)));
757 	 return;
758 
759       case ir_binop_all_equal:
760 	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
761 	 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
762 	 return;
763 
764       case ir_binop_any_nequal:
765 	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
766 	 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
767 	 return;
768 
769       case ir_unop_any:
770 	 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
771 	 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
772 	 return;
773 
774       default:
775 	 assert(!"not reached");
776 	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
777 	 return;
778       }
779       return;
780    }
781 
782    ir->condition->accept(this);
783 
784    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
785 }
786 
787 void
vec4_visitor::visit(ir_variable *ir)
789 {
790    dst_reg *reg = NULL;
791 
792    if (variable_storage(ir))
793       return;
794 
795    switch (ir->mode) {
796    case ir_var_in:
797       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
798 
799       /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
800        * come in as floating point conversions of the integer values.
801        */
802       for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
803 	 if (!c->key.gl_fixed_input_size[i])
804 	    continue;
805 
806 	 dst_reg dst = *reg;
807          dst.type = brw_type_for_base_type(ir->type);
808 	 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
809 	 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
810       }
811       break;
812 
813    case ir_var_out:
814       reg = new(mem_ctx) dst_reg(this, ir->type);
815 
816       for (int i = 0; i < type_size(ir->type); i++) {
817 	 output_reg[ir->location + i] = *reg;
818 	 output_reg[ir->location + i].reg_offset = i;
819 	 output_reg[ir->location + i].type =
820             brw_type_for_base_type(ir->type->get_scalar_type());
821 	 output_reg_annotation[ir->location + i] = ir->name;
822       }
823       break;
824 
825    case ir_var_auto:
826    case ir_var_temporary:
827       reg = new(mem_ctx) dst_reg(this, ir->type);
828       break;
829 
830    case ir_var_uniform:
831       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
832 
833       /* Thanks to the lower_ubo_reference pass, we will see only
834        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
835        * variables, so no need for them to be in variable_ht.
836        */
837       if (ir->uniform_block != -1)
838          return;
839 
840       /* Track how big the whole uniform variable is, in case we need to put a
841        * copy of its data into pull constants for array access.
842        */
843       this->uniform_size[this->uniforms] = type_size(ir->type);
844 
845       if (!strncmp(ir->name, "gl_", 3)) {
846 	 setup_builtin_uniform_values(ir);
847       } else {
848 	 setup_uniform_values(ir->location, ir->type);
849       }
850       break;
851 
852    case ir_var_system_value:
853       /* VertexID is stored by the VF as the last vertex element, but
854        * we don't represent it with a flag in inputs_read, so we call
855        * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
856        */
857       reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
858       prog_data->uses_vertexid = true;
859 
860       switch (ir->location) {
861       case SYSTEM_VALUE_VERTEX_ID:
862 	 reg->writemask = WRITEMASK_X;
863 	 break;
864       case SYSTEM_VALUE_INSTANCE_ID:
865 	 reg->writemask = WRITEMASK_Y;
866 	 break;
867       default:
868 	 assert(!"not reached");
869 	 break;
870       }
871       break;
872 
873    default:
874       assert(!"not reached");
875    }
876 
877    reg->type = brw_type_for_base_type(ir->type);
878    hash_table_insert(this->variable_ht, reg, ir);
879 }
880 
881 void
vec4_visitor::visit(ir_loop *ir)
883 {
884    dst_reg counter;
885 
886    /* We don't want debugging output to print the whole body of the
887     * loop as the annotation.
888     */
889    this->base_ir = NULL;
890 
891    if (ir->counter != NULL) {
892       this->base_ir = ir->counter;
893       ir->counter->accept(this);
894       counter = *(variable_storage(ir->counter));
895 
896       if (ir->from != NULL) {
897 	 this->base_ir = ir->from;
898 	 ir->from->accept(this);
899 
900 	 emit(MOV(counter, this->result));
901       }
902    }
903 
904    emit(BRW_OPCODE_DO);
905 
906    if (ir->to) {
907       this->base_ir = ir->to;
908       ir->to->accept(this);
909 
910       emit(CMP(dst_null_d(), src_reg(counter), this->result,
911 	       brw_conditional_for_comparison(ir->cmp)));
912 
913       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
914       inst->predicate = BRW_PREDICATE_NORMAL;
915    }
916 
917    visit_instructions(&ir->body_instructions);
918 
919 
920    if (ir->increment) {
921       this->base_ir = ir->increment;
922       ir->increment->accept(this);
923       emit(ADD(counter, src_reg(counter), this->result));
924    }
925 
926    emit(BRW_OPCODE_WHILE);
927 }
928 
929 void
vec4_visitor::visit(ir_loop_jump *ir)
931 {
932    switch (ir->mode) {
933    case ir_loop_jump::jump_break:
934       emit(BRW_OPCODE_BREAK);
935       break;
936    case ir_loop_jump::jump_continue:
937       emit(BRW_OPCODE_CONTINUE);
938       break;
939    }
940 }
941 
942 
943 void
vec4_visitor::visit(ir_function_signature *ir)
945 {
946    assert(0);
947    (void)ir;
948 }
949 
950 void
vec4_visitor::visit(ir_function *ir)
952 {
953    /* Ignore function bodies other than main() -- we shouldn't see calls to
954     * them since they should all be inlined.
955     */
956    if (strcmp(ir->name, "main") == 0) {
957       const ir_function_signature *sig;
958       exec_list empty;
959 
960       sig = ir->matching_signature(&empty);
961 
962       assert(sig);
963 
964       visit_instructions(&sig->body);
965    }
966 }
967 
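/**
 * If this expression is just an rvalue to be saturated, emits it as a
 * single saturating MOV and returns true; otherwise returns false and
 * lets the normal expression path handle it.
 */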
968 bool
vec4_visitor::try_emit_sat(ir_expression *ir)
970 {
971    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
972    if (!sat_src)
973       return false;
974 
975    sat_src->accept(this);
976    src_reg src = this->result;
977 
978    this->result = src_reg(this, ir->type);
979    vec4_instruction *inst;
980    inst = emit(MOV(dst_reg(this->result), src));
981    inst->saturate = true;
982 
983    return true;
984 }
985 
986 void
vec4_visitor::emit_bool_comparison(unsigned int op,
				 dst_reg dst, src_reg src0, src_reg src1)
989 {
990    /* original gen4 does destination conversion before comparison. */
991    if (intel->gen < 5)
992       dst.type = src0.type;
993 
994    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
995 
996    dst.type = BRW_REGISTER_TYPE_D;
997    emit(AND(dst, src_reg(dst), src_reg(0x1)));
998 }
999 
1000 void
vec4_visitor::visit(ir_expression *ir)
1002 {
1003    unsigned int operand;
1004    src_reg op[Elements(ir->operands)];
1005    src_reg result_src;
1006    dst_reg result_dst;
1007    vec4_instruction *inst;
1008 
1009    if (try_emit_sat(ir))
1010       return;
1011 
1012    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1013       this->result.file = BAD_FILE;
1014       ir->operands[operand]->accept(this);
1015       if (this->result.file == BAD_FILE) {
1016 	 printf("Failed to get tree for expression operand:\n");
1017 	 ir->operands[operand]->print();
1018 	 exit(1);
1019       }
1020       op[operand] = this->result;
1021 
1022       /* Matrix expression operands should have been broken down to vector
1023        * operations already.
1024        */
1025       assert(!ir->operands[operand]->type->is_matrix());
1026    }
1027 
1028    int vector_elements = ir->operands[0]->type->vector_elements;
1029    if (ir->operands[1]) {
1030       vector_elements = MAX2(vector_elements,
1031 			     ir->operands[1]->type->vector_elements);
1032    }
1033 
1034    this->result.file = BAD_FILE;
1035 
1036    /* Storage for our result.  Ideally for an assignment we'd be using
1037     * the actual storage for the result here, instead.
1038     */
1039    result_src = src_reg(this, ir->type);
1040    /* convenience for the emit functions below. */
1041    result_dst = dst_reg(result_src);
1042    /* If nothing special happens, this is the result. */
1043    this->result = result_src;
1044    /* Limit writes to the channels that will be used by result_src later.
1045     * This does limit this temp's use as a temporary for multi-instruction
1046     * sequences.
1047     */
1048    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1049 
1050    switch (ir->operation) {
1051    case ir_unop_logic_not:
1052       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1053        * ones complement of the whole register, not just bit 0.
1054        */
1055       emit(XOR(result_dst, op[0], src_reg(1)));
1056       break;
1057    case ir_unop_neg:
1058       op[0].negate = !op[0].negate;
1059       this->result = op[0];
1060       break;
1061    case ir_unop_abs:
1062       op[0].abs = true;
1063       op[0].negate = false;
1064       this->result = op[0];
1065       break;
1066 
1067    case ir_unop_sign:
1068       emit(MOV(result_dst, src_reg(0.0f)));
1069 
1070       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1071       inst = emit(MOV(result_dst, src_reg(1.0f)));
1072       inst->predicate = BRW_PREDICATE_NORMAL;
1073 
1074       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1075       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1076       inst->predicate = BRW_PREDICATE_NORMAL;
1077 
1078       break;
1079 
1080    case ir_unop_rcp:
1081       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1082       break;
1083 
1084    case ir_unop_exp2:
1085       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1086       break;
1087    case ir_unop_log2:
1088       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1089       break;
1090    case ir_unop_exp:
1091    case ir_unop_log:
1092       assert(!"not reached: should be handled by ir_explog_to_explog2");
1093       break;
1094    case ir_unop_sin:
1095    case ir_unop_sin_reduced:
1096       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1097       break;
1098    case ir_unop_cos:
1099    case ir_unop_cos_reduced:
1100       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1101       break;
1102 
1103    case ir_unop_dFdx:
1104    case ir_unop_dFdy:
1105       assert(!"derivatives not valid in vertex shader");
1106       break;
1107 
1108    case ir_unop_noise:
1109       assert(!"not reached: should be handled by lower_noise");
1110       break;
1111 
1112    case ir_binop_add:
1113       emit(ADD(result_dst, op[0], op[1]));
1114       break;
1115    case ir_binop_sub:
1116       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1117       break;
1118 
1119    case ir_binop_mul:
1120       if (ir->type->is_integer()) {
1121 	 /* For integer multiplication, the MUL uses the low 16 bits
1122 	  * of one of the operands (src0 on gen6, src1 on gen7).  The
1123 	  * MACH accumulates in the contribution of the upper 16 bits
1124 	  * of that operand.
1125 	  *
1126 	  * FINISHME: Emit just the MUL if we know an operand is small
1127 	  * enough.
1128 	  */
1129 	 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1130 
1131 	 emit(MUL(acc, op[0], op[1]));
1132 	 emit(MACH(dst_null_d(), op[0], op[1]));
1133 	 emit(MOV(result_dst, src_reg(acc)));
1134       } else {
1135 	 emit(MUL(result_dst, op[0], op[1]));
1136       }
1137       break;
1138    case ir_binop_div:
1139       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1140       assert(ir->type->is_integer());
1141       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1142       break;
1143    case ir_binop_mod:
1144       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1145       assert(ir->type->is_integer());
1146       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1147       break;
1148 
1149    case ir_binop_less:
1150    case ir_binop_greater:
1151    case ir_binop_lequal:
1152    case ir_binop_gequal:
1153    case ir_binop_equal:
1154    case ir_binop_nequal: {
1155       emit(CMP(result_dst, op[0], op[1],
1156 	       brw_conditional_for_comparison(ir->operation)));
1157       emit(AND(result_dst, result_src, src_reg(0x1)));
1158       break;
1159    }
1160 
1161    case ir_binop_all_equal:
1162       /* "==" operator producing a scalar boolean. */
1163       if (ir->operands[0]->type->is_vector() ||
1164 	  ir->operands[1]->type->is_vector()) {
1165 	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1166 	 emit(MOV(result_dst, src_reg(0)));
1167 	 inst = emit(MOV(result_dst, src_reg(1)));
1168 	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1169       } else {
1170 	 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1171 	 emit(AND(result_dst, result_src, src_reg(0x1)));
1172       }
1173       break;
1174    case ir_binop_any_nequal:
1175       /* "!=" operator producing a scalar boolean. */
1176       if (ir->operands[0]->type->is_vector() ||
1177 	  ir->operands[1]->type->is_vector()) {
1178 	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1179 
1180 	 emit(MOV(result_dst, src_reg(0)));
1181 	 inst = emit(MOV(result_dst, src_reg(1)));
1182 	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1183       } else {
1184 	 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1185 	 emit(AND(result_dst, result_src, src_reg(0x1)));
1186       }
1187       break;
1188 
1189    case ir_unop_any:
1190       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1191       emit(MOV(result_dst, src_reg(0)));
1192 
1193       inst = emit(MOV(result_dst, src_reg(1)));
1194       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1195       break;
1196 
1197    case ir_binop_logic_xor:
1198       emit(XOR(result_dst, op[0], op[1]));
1199       break;
1200 
1201    case ir_binop_logic_or:
1202       emit(OR(result_dst, op[0], op[1]));
1203       break;
1204 
1205    case ir_binop_logic_and:
1206       emit(AND(result_dst, op[0], op[1]));
1207       break;
1208 
1209    case ir_binop_dot:
1210       assert(ir->operands[0]->type->is_vector());
1211       assert(ir->operands[0]->type == ir->operands[1]->type);
1212       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1213       break;
1214 
1215    case ir_unop_sqrt:
1216       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1217       break;
1218    case ir_unop_rsq:
1219       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1220       break;
1221 
1222    case ir_unop_bitcast_i2f:
1223    case ir_unop_bitcast_u2f:
1224       this->result = op[0];
1225       this->result.type = BRW_REGISTER_TYPE_F;
1226       break;
1227 
1228    case ir_unop_bitcast_f2i:
1229       this->result = op[0];
1230       this->result.type = BRW_REGISTER_TYPE_D;
1231       break;
1232 
1233    case ir_unop_bitcast_f2u:
1234       this->result = op[0];
1235       this->result.type = BRW_REGISTER_TYPE_UD;
1236       break;
1237 
1238    case ir_unop_i2f:
1239    case ir_unop_i2u:
1240    case ir_unop_u2i:
1241    case ir_unop_u2f:
1242    case ir_unop_b2f:
1243    case ir_unop_b2i:
1244    case ir_unop_f2i:
1245    case ir_unop_f2u:
1246       emit(MOV(result_dst, op[0]));
1247       break;
1248    case ir_unop_f2b:
1249    case ir_unop_i2b: {
1250       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1251       emit(AND(result_dst, result_src, src_reg(1)));
1252       break;
1253    }
1254 
1255    case ir_unop_trunc:
1256       emit(RNDZ(result_dst, op[0]));
1257       break;
1258    case ir_unop_ceil:
1259       op[0].negate = !op[0].negate;
1260       inst = emit(RNDD(result_dst, op[0]));
1261       this->result.negate = true;
1262       break;
1263    case ir_unop_floor:
1264       inst = emit(RNDD(result_dst, op[0]));
1265       break;
1266    case ir_unop_fract:
1267       inst = emit(FRC(result_dst, op[0]));
1268       break;
1269    case ir_unop_round_even:
1270       emit(RNDE(result_dst, op[0]));
1271       break;
1272 
1273    case ir_binop_min:
1274       if (intel->gen >= 6) {
1275 	 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1276 	 inst->conditional_mod = BRW_CONDITIONAL_L;
1277       } else {
1278 	 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1279 
1280 	 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1281 	 inst->predicate = BRW_PREDICATE_NORMAL;
1282       }
1283       break;
1284    case ir_binop_max:
1285       if (intel->gen >= 6) {
1286 	 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1287 	 inst->conditional_mod = BRW_CONDITIONAL_G;
1288       } else {
1289 	 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1290 
1291 	 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1292 	 inst->predicate = BRW_PREDICATE_NORMAL;
1293       }
1294       break;
1295 
1296    case ir_binop_pow:
1297       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1298       break;
1299 
1300    case ir_unop_bit_not:
1301       inst = emit(NOT(result_dst, op[0]));
1302       break;
1303    case ir_binop_bit_and:
1304       inst = emit(AND(result_dst, op[0], op[1]));
1305       break;
1306    case ir_binop_bit_xor:
1307       inst = emit(XOR(result_dst, op[0], op[1]));
1308       break;
1309    case ir_binop_bit_or:
1310       inst = emit(OR(result_dst, op[0], op[1]));
1311       break;
1312 
1313    case ir_binop_lshift:
1314       inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1315       break;
1316 
1317    case ir_binop_rshift:
1318       if (ir->type->base_type == GLSL_TYPE_INT)
1319 	 inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1320       else
1321 	 inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1322       break;
1323 
1324    case ir_binop_ubo_load: {
1325       ir_constant *uniform_block = ir->operands[0]->as_constant();
1326       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1327       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1328       src_reg offset = op[1];
1329 
1330       /* Now, load the vector from that offset. */
1331       assert(ir->type->is_vector() || ir->type->is_scalar());
1332 
1333       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1334       packed_consts.type = result.type;
1335       src_reg surf_index =
1336          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1337       if (const_offset_ir) {
1338          offset = src_reg(const_offset / 16);
1339       } else {
1340          emit(BRW_OPCODE_SHR, dst_reg(offset), offset, src_reg(4));
1341       }
1342 
1343       vec4_instruction *pull =
1344          emit(new(mem_ctx) vec4_instruction(this,
1345                                             VS_OPCODE_PULL_CONSTANT_LOAD,
1346                                             dst_reg(packed_consts),
1347                                             surf_index,
1348                                             offset));
1349       pull->base_mrf = 14;
1350       pull->mlen = 1;
1351 
1352       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1353       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1354                                             const_offset % 16 / 4,
1355                                             const_offset % 16 / 4,
1356                                             const_offset % 16 / 4);
1357 
1358       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1359       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1360          emit(CMP(result_dst, packed_consts, src_reg(0u),
1361                   BRW_CONDITIONAL_NZ));
1362          emit(AND(result_dst, result, src_reg(0x1)));
1363       } else {
1364          emit(MOV(result_dst, packed_consts));
1365       }
1366       break;
1367    }
1368 
1369    case ir_quadop_vector:
1370       assert(!"not reached: should be handled by lower_quadop_vector");
1371       break;
1372    }
1373 }
1374 
1375 
1376 void
vec4_visitor::visit(ir_swizzle *ir)
1378 {
1379    src_reg src;
1380    int i = 0;
1381    int swizzle[4];
1382 
1383    /* Note that this is only swizzles in expressions, not those on the left
1384     * hand side of an assignment, which do write masking.  See ir_assignment
1385     * for that.
1386     */
1387 
1388    ir->val->accept(this);
1389    src = this->result;
1390    assert(src.file != BAD_FILE);
1391 
1392    for (i = 0; i < ir->type->vector_elements; i++) {
1393       switch (i) {
1394       case 0:
1395 	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1396 	 break;
1397       case 1:
1398 	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1399 	 break;
1400       case 2:
1401 	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1402 	 break;
1403       case 3:
1404 	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1405 	    break;
1406       }
1407    }
1408    for (; i < 4; i++) {
1409       /* Replicate the last channel out. */
1410       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1411    }
1412 
1413    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1414 
1415    this->result = src;
1416 }
1417 
1418 void
vec4_visitor::visit(ir_dereference_variable *ir)
1420 {
1421    const struct glsl_type *type = ir->type;
1422    dst_reg *reg = variable_storage(ir->var);
1423 
1424    if (!reg) {
1425       fail("Failed to find variable storage for %s\n", ir->var->name);
1426       this->result = src_reg(brw_null_reg());
1427       return;
1428    }
1429 
1430    this->result = src_reg(*reg);
1431 
1432    /* System values get their swizzle from the dst_reg writemask */
1433    if (ir->var->mode == ir_var_system_value)
1434       return;
1435 
1436    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1437       this->result.swizzle = swizzle_for_size(type->vector_elements);
1438 }
1439 
1440 void
vec4_visitor::visit(ir_dereference_array *ir)
1442 {
1443    ir_constant *constant_index;
1444    src_reg src;
1445    int element_size = type_size(ir->type);
1446 
1447    constant_index = ir->array_index->constant_expression_value();
1448 
1449    ir->array->accept(this);
1450    src = this->result;
1451 
1452    if (constant_index) {
1453       src.reg_offset += constant_index->value.i[0] * element_size;
1454    } else {
1455       /* Variable index array dereference.  It eats the "vec4" of the
1456        * base of the array and an index that offsets the Mesa register
1457        * index.
1458        */
1459       ir->array_index->accept(this);
1460 
1461       src_reg index_reg;
1462 
1463       if (element_size == 1) {
1464 	 index_reg = this->result;
1465       } else {
1466 	 index_reg = src_reg(this, glsl_type::int_type);
1467 
1468 	 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1469       }
1470 
1471       if (src.reladdr) {
1472 	 src_reg temp = src_reg(this, glsl_type::int_type);
1473 
1474 	 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1475 
1476 	 index_reg = temp;
1477       }
1478 
1479       src.reladdr = ralloc(mem_ctx, src_reg);
1480       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1481    }
1482 
1483    /* If the type is smaller than a vec4, replicate the last channel out. */
1484    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1485       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1486    else
1487       src.swizzle = BRW_SWIZZLE_NOOP;
1488    src.type = brw_type_for_base_type(ir->type);
1489 
1490    this->result = src;
1491 }
1492 
1493 void
vec4_visitor::visit(ir_dereference_record *ir)
1495 {
1496    unsigned int i;
1497    const glsl_type *struct_type = ir->record->type;
1498    int offset = 0;
1499 
1500    ir->record->accept(this);
1501 
1502    for (i = 0; i < struct_type->length; i++) {
1503       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1504 	 break;
1505       offset += type_size(struct_type->fields.structure[i].type);
1506    }
1507 
1508    /* If the type is smaller than a vec4, replicate the last channel out. */
1509    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1510       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1511    else
1512       this->result.swizzle = BRW_SWIZZLE_NOOP;
1513    this->result.type = brw_type_for_base_type(ir->type);
1514 
1515    this->result.reg_offset += offset;
1516 }
1517 
1518 /**
1519  * We want to be careful in assignment setup to hit the actual storage
1520  * instead of potentially using a temporary like we might with the
1521  * ir_dereference handler.
1522  */
1523 static dst_reg
get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1525 {
   /* The LHS must be a dereference.  If the LHS is a variable indexed array
    * access of a vector, it must be separated into a series of conditional
    * moves before reaching this point (see ir_vec_index_to_cond_assign).
    */
1530    assert(ir->as_dereference());
1531    ir_dereference_array *deref_array = ir->as_dereference_array();
1532    if (deref_array) {
1533       assert(!deref_array->array->type->is_vector());
1534    }
1535 
1536    /* Use the rvalue deref handler for the most part.  We'll ignore
1537     * swizzles in it and write swizzles using writemask, though.
1538     */
1539    ir->accept(v);
1540    return dst_reg(v->result);
1541 }
1542 
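/**
 * Copies a struct, array, or matrix value one vec4 register at a time,
 * recursing through the type and advancing the source and destination
 * reg_offsets as it goes.  The MOVs are predicated for conditional
 * assignments.
 */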
1543 void
vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
			      const struct glsl_type *type, uint32_t predicate)
1546 {
1547    if (type->base_type == GLSL_TYPE_STRUCT) {
1548       for (unsigned int i = 0; i < type->length; i++) {
1549 	 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1550       }
1551       return;
1552    }
1553 
1554    if (type->is_array()) {
1555       for (unsigned int i = 0; i < type->length; i++) {
1556 	 emit_block_move(dst, src, type->fields.array, predicate);
1557       }
1558       return;
1559    }
1560 
1561    if (type->is_matrix()) {
1562       const struct glsl_type *vec_type;
1563 
1564       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1565 					 type->vector_elements, 1);
1566 
1567       for (int i = 0; i < type->matrix_columns; i++) {
1568 	 emit_block_move(dst, src, vec_type, predicate);
1569       }
1570       return;
1571    }
1572 
1573    assert(type->is_scalar() || type->is_vector());
1574 
1575    dst->type = brw_type_for_base_type(type);
1576    src->type = dst->type;
1577 
1578    dst->writemask = (1 << type->vector_elements) - 1;
1579 
1580    src->swizzle = swizzle_for_size(type->vector_elements);
1581 
1582    vec4_instruction *inst = emit(MOV(*dst, *src));
1583    inst->predicate = predicate;
1584 
1585    dst->reg_offset++;
1586    src->reg_offset++;
1587 }
1588 
1589 
1590 /* If the RHS processing resulted in an instruction generating a
1591  * temporary value, and it would be easy to rewrite the instruction to
1592  * generate its result right into the LHS instead, do so.  This ends
1593  * up reliably removing instructions where it can be tricky to do so
1594  * later without real UD chain information.
1595  */
1596 bool
vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
				     dst_reg dst,
				     src_reg src,
				     vec4_instruction *pre_rhs_inst,
				     vec4_instruction *last_rhs_inst)
1602 {
1603    /* This could be supported, but it would take more smarts. */
1604    if (ir->condition)
1605       return false;
1606 
1607    if (pre_rhs_inst == last_rhs_inst)
1608       return false; /* No instructions generated to work with. */
1609 
1610    /* Make sure the last instruction generated our source reg. */
1611    if (src.file != GRF ||
1612        src.file != last_rhs_inst->dst.file ||
1613        src.reg != last_rhs_inst->dst.reg ||
1614        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1615        src.reladdr ||
1616        src.abs ||
1617        src.negate ||
1618        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1619       return false;
1620 
   /* Check that the last instruction fully initialized the channels
    * we want to use, in the order we want to use them.  We could
    * potentially reswizzle the operands of many instructions so that
    * we could handle out of order channels, but don't yet.
    */
1626 
1627    for (unsigned i = 0; i < 4; i++) {
1628       if (dst.writemask & (1 << i)) {
1629 	 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1630 	    return false;
1631 
1632 	 if (BRW_GET_SWZ(src.swizzle, i) != i)
1633 	    return false;
1634       }
1635    }
1636 
1637    /* Success!  Rewrite the instruction. */
1638    last_rhs_inst->dst.file = dst.file;
1639    last_rhs_inst->dst.reg = dst.reg;
1640    last_rhs_inst->dst.reg_offset = dst.reg_offset;
1641    last_rhs_inst->dst.reladdr = dst.reladdr;
1642    last_rhs_inst->dst.writemask &= dst.writemask;
1643 
1644    return true;
1645 }
1646 
1647 void
vec4_visitor::visit(ir_assignment *ir)
1649 {
1650    dst_reg dst = get_assignment_lhs(ir->lhs, this);
1651    uint32_t predicate = BRW_PREDICATE_NONE;
1652 
1653    if (!ir->lhs->type->is_scalar() &&
1654        !ir->lhs->type->is_vector()) {
1655       ir->rhs->accept(this);
1656       src_reg src = this->result;
1657 
1658       if (ir->condition) {
1659 	 emit_bool_to_cond_code(ir->condition, &predicate);
1660       }
1661 
1662       /* emit_block_move doesn't account for swizzles in the source register.
1663        * This should be ok, since the source register is a structure or an
1664        * array, and those can't be swizzled.  But double-check to be sure.
1665        */
1666       assert(src.swizzle ==
1667              (ir->rhs->type->is_matrix()
1668               ? swizzle_for_size(ir->rhs->type->vector_elements)
1669               : BRW_SWIZZLE_NOOP));
1670 
1671       emit_block_move(&dst, &src, ir->rhs->type, predicate);
1672       return;
1673    }
1674 
1675    /* Now we're down to just a scalar/vector with writemasks. */
1676    int i;
1677 
1678    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1679    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1680 
1681    ir->rhs->accept(this);
1682 
1683    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1684 
1685    src_reg src = this->result;
1686 
1687    int swizzles[4];
1688    int first_enabled_chan = 0;
1689    int src_chan = 0;
1690 
1691    assert(ir->lhs->type->is_vector() ||
1692 	  ir->lhs->type->is_scalar());
1693    dst.writemask = ir->write_mask;
1694 
1695    for (int i = 0; i < 4; i++) {
1696       if (dst.writemask & (1 << i)) {
1697 	 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1698 	 break;
1699       }
1700    }
1701 
1702    /* Swizzle a small RHS vector into the channels being written.
1703     *
1704     * glsl ir treats write_mask as dictating how many channels are
1705     * present on the RHS while in our instructions we need to make
1706     * those channels appear in the slots of the vec4 they're written to.
1707     */
1708    for (int i = 0; i < 4; i++) {
1709       if (dst.writemask & (1 << i))
1710 	 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1711       else
1712 	 swizzles[i] = first_enabled_chan;
1713    }
1714    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1715 			      swizzles[2], swizzles[3]);
1716 
1717    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1718       return;
1719    }
1720 
1721    if (ir->condition) {
1722       emit_bool_to_cond_code(ir->condition, &predicate);
1723    }
1724 
1725    for (i = 0; i < type_size(ir->lhs->type); i++) {
1726       vec4_instruction *inst = emit(MOV(dst, src));
1727       inst->predicate = predicate;
1728 
1729       dst.reg_offset++;
1730       src.reg_offset++;
1731    }
1732 }
1733 
1734 void
1735 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1736 {
1737    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1738       foreach_list(node, &ir->components) {
1739 	 ir_constant *field_value = (ir_constant *)node;
1740 
1741 	 emit_constant_values(dst, field_value);
1742       }
1743       return;
1744    }
1745 
1746    if (ir->type->is_array()) {
1747       for (unsigned int i = 0; i < ir->type->length; i++) {
1748 	 emit_constant_values(dst, ir->array_elements[i]);
1749       }
1750       return;
1751    }
1752 
1753    if (ir->type->is_matrix()) {
1754       for (int i = 0; i < ir->type->matrix_columns; i++) {
1755 	 float *vec = &ir->value.f[i * ir->type->vector_elements];
1756 
1757 	 for (int j = 0; j < ir->type->vector_elements; j++) {
1758 	    dst->writemask = 1 << j;
1759 	    dst->type = BRW_REGISTER_TYPE_F;
1760 
1761 	    emit(MOV(*dst, src_reg(vec[j])));
1762 	 }
1763 	 dst->reg_offset++;
1764       }
1765       return;
1766    }
1767 
1768    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1769 
1770    for (int i = 0; i < ir->type->vector_elements; i++) {
1771       if (!(remaining_writemask & (1 << i)))
1772 	 continue;
1773 
1774       dst->writemask = 1 << i;
1775       dst->type = brw_type_for_base_type(ir->type);
1776 
1777       /* Find other components that match the one we're about to
1778        * write.  Emits fewer instructions for things like vec4(0.5,
1779        * 1.5, 1.5, 1.5).
1780        */
1781       for (int j = i + 1; j < ir->type->vector_elements; j++) {
1782 	 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1783 	    if (ir->value.b[i] == ir->value.b[j])
1784 	       dst->writemask |= (1 << j);
1785 	 } else {
1786 	    /* u, i, and f storage all line up, so no need for a
1787 	     * switch case for comparing each type.
1788 	     */
1789 	    if (ir->value.u[i] == ir->value.u[j])
1790 	       dst->writemask |= (1 << j);
1791 	 }
1792       }
1793 
1794       switch (ir->type->base_type) {
1795       case GLSL_TYPE_FLOAT:
1796 	 emit(MOV(*dst, src_reg(ir->value.f[i])));
1797 	 break;
1798       case GLSL_TYPE_INT:
1799 	 emit(MOV(*dst, src_reg(ir->value.i[i])));
1800 	 break;
1801       case GLSL_TYPE_UINT:
1802 	 emit(MOV(*dst, src_reg(ir->value.u[i])));
1803 	 break;
1804       case GLSL_TYPE_BOOL:
1805 	 emit(MOV(*dst, src_reg(ir->value.b[i])));
1806 	 break;
1807       default:
1808 	 assert(!"Non-float/uint/int/bool constant");
1809 	 break;
1810       }
1811 
1812       remaining_writemask &= ~dst->writemask;
1813    }
1814    dst->reg_offset++;
1815 }
1816 
1817 void
1818 vec4_visitor::visit(ir_constant *ir)
1819 {
1820    dst_reg dst = dst_reg(this, ir->type);
1821    this->result = src_reg(dst);
1822 
1823    emit_constant_values(&dst, ir);
1824 }
1825 
1826 void
1827 vec4_visitor::visit(ir_call *ir)
1828 {
1829    assert(!"not reached");
1830 }
1831 
1832 void
1833 vec4_visitor::visit(ir_texture *ir)
1834 {
1835    int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1836 
1837    /* Should be lowered by do_lower_texture_projection */
1838    assert(!ir->projector);
1839 
1840    /* Generate code to compute all the subexpression trees.  This has to be
1841     * done before loading any values into MRFs for the sampler message since
1842     * generating these values may involve SEND messages that need the MRFs.
1843     */
1844    src_reg coordinate;
1845    if (ir->coordinate) {
1846       ir->coordinate->accept(this);
1847       coordinate = this->result;
1848    }
1849 
1850    src_reg shadow_comparitor;
1851    if (ir->shadow_comparitor) {
1852       ir->shadow_comparitor->accept(this);
1853       shadow_comparitor = this->result;
1854    }
1855 
1856    const glsl_type *lod_type;
1857    src_reg lod, dPdx, dPdy;
1858    switch (ir->op) {
1859    case ir_tex:
1860       lod = src_reg(0.0f);
1861       lod_type = glsl_type::float_type;
1862       break;
1863    case ir_txf:
1864    case ir_txl:
1865    case ir_txs:
1866       ir->lod_info.lod->accept(this);
1867       lod = this->result;
1868       lod_type = ir->lod_info.lod->type;
1869       break;
1870    case ir_txd:
1871       ir->lod_info.grad.dPdx->accept(this);
1872       dPdx = this->result;
1873 
1874       ir->lod_info.grad.dPdy->accept(this);
1875       dPdy = this->result;
1876 
1877       lod_type = ir->lod_info.grad.dPdx->type;
1878       break;
1879    case ir_txb:
1880       break;
1881    }
1882 
1883    vec4_instruction *inst = NULL;
1884    switch (ir->op) {
1885    case ir_tex:
1886    case ir_txl:
1887       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1888       break;
1889    case ir_txd:
1890       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1891       break;
1892    case ir_txf:
1893       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1894       break;
1895    case ir_txs:
1896       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1897       break;
1898    case ir_txb:
1899       assert(!"TXB is not valid for vertex shaders.");
1900    }
1901 
1902    /* Texel offsets go in the message header; Gen4 also requires headers. */
1903    inst->header_present = ir->offset || intel->gen < 5;
1904    inst->base_mrf = 2;
1905    inst->mlen = inst->header_present + 1; /* always at least one */
1906    inst->sampler = sampler;
1907    inst->dst = dst_reg(this, ir->type);
1908    inst->dst.writemask = WRITEMASK_XYZW;
1909    inst->shadow_compare = ir->shadow_comparitor != NULL;
1910 
1911    if (ir->offset != NULL && ir->op != ir_txf)
1912       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1913 
1914    /* MRF for the first parameter */
1915    int param_base = inst->base_mrf + inst->header_present;
1916 
1917    if (ir->op == ir_txs) {
1918       int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
1919       emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
1920    } else {
1921       int i, coord_mask = 0, zero_mask = 0;
1922       /* Load the coordinate */
1923       /* FINISHME: gl_clamp_mask and saturate */
1924       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
1925 	 coord_mask |= (1 << i);
1926       for (; i < 4; i++)
1927 	 zero_mask |= (1 << i);
1928 
1929       if (ir->offset && ir->op == ir_txf) {
1930 	 /* It appears that the ld instruction used for txf does its
1931 	  * address bounds check before adding in the offset.  To work
1932 	  * around this, just add the integer offset to the integer
1933 	  * texel coordinate, and don't put the offset in the header.
1934 	  */
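	 /* For example, a texelFetch() with a constant offset of (1, 2)
	  * becomes per-component ADDs that store coord.x + 1 and
	  * coord.y + 2 into the coordinate MRF, instead of putting the
	  * offset in the message header.
	  */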
1935 	 ir_constant *offset = ir->offset->as_constant();
1936 	 assert(offset);
1937 
1938 	 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
1939 	    src_reg src = coordinate;
1940 	    src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
1941 				       BRW_GET_SWZ(src.swizzle, j),
1942 				       BRW_GET_SWZ(src.swizzle, j),
1943 				       BRW_GET_SWZ(src.swizzle, j));
1944 	    emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
1945 		     src, offset->value.i[j]));
1946 	 }
1947       } else {
1948 	 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
1949 		  coordinate));
1950       }
1951       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
1952 	       src_reg(0)));
1953       /* Load the shadow comparitor */
1954       if (ir->shadow_comparitor) {
1955 	 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
1956 			  WRITEMASK_X),
1957 		  shadow_comparitor));
1958 	 inst->mlen++;
1959       }
1960 
1961       /* Load the LOD info */
1962       if (ir->op == ir_tex || ir->op == ir_txl) {
1963 	 int mrf, writemask;
1964 	 if (intel->gen >= 5) {
1965 	    mrf = param_base + 1;
1966 	    if (ir->shadow_comparitor) {
1967 	       writemask = WRITEMASK_Y;
1968 	       /* mlen already incremented */
1969 	    } else {
1970 	       writemask = WRITEMASK_X;
1971 	       inst->mlen++;
1972 	    }
1973 	 } else /* intel->gen == 4 */ {
1974 	    mrf = param_base;
1975 	    writemask = WRITEMASK_Z;
1976 	 }
1977 	 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
1978       } else if (ir->op == ir_txf) {
1979 	 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W),
1980 		  lod));
1981       } else if (ir->op == ir_txd) {
1982 	 const glsl_type *type = lod_type;
1983 
1984 	 if (intel->gen >= 5) {
1985 	    dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1986 	    dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1987 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
1988 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
1989 	    inst->mlen++;
1990 
1991 	    if (ir->type->vector_elements == 3) {
1992 	       dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
1993 	       dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
1994 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
1995 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
1996 	       inst->mlen++;
1997 	    }
1998 	 } else /* intel->gen == 4 */ {
1999 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2000 	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2001 	    inst->mlen += 2;
2002 	 }
2003       }
2004    }
2005 
2006    emit(inst);
2007 
2008    swizzle_result(ir, src_reg(inst->dst), sampler);
2009 }
2010 
2011 void
2012 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2013 {
2014    int s = c->key.tex.swizzles[sampler];
2015 
2016    this->result = src_reg(this, ir->type);
2017    dst_reg swizzled_result(this->result);
2018 
2019    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2020 			|| s == SWIZZLE_NOOP) {
2021       emit(MOV(swizzled_result, orig_val));
2022       return;
2023    }
2024 
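   /* Illustration: a texture swizzle state of (ALPHA, GREEN, ZERO, ONE)
    * yields copy_mask = xy (reading .wy from the sampler result),
    * zero_mask = z and one_mask = w, so three MOVs are emitted below.
    */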
2025    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2026    int swizzle[4];
2027 
2028    for (int i = 0; i < 4; i++) {
2029       switch (GET_SWZ(s, i)) {
2030       case SWIZZLE_ZERO:
2031 	 zero_mask |= (1 << i);
2032 	 break;
2033       case SWIZZLE_ONE:
2034 	 one_mask |= (1 << i);
2035 	 break;
2036       default:
2037 	 copy_mask |= (1 << i);
2038 	 swizzle[i] = GET_SWZ(s, i);
2039 	 break;
2040       }
2041    }
2042 
2043    if (copy_mask) {
2044       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2045       swizzled_result.writemask = copy_mask;
2046       emit(MOV(swizzled_result, orig_val));
2047    }
2048 
2049    if (zero_mask) {
2050       swizzled_result.writemask = zero_mask;
2051       emit(MOV(swizzled_result, src_reg(0.0f)));
2052    }
2053 
2054    if (one_mask) {
2055       swizzled_result.writemask = one_mask;
2056       emit(MOV(swizzled_result, src_reg(1.0f)));
2057    }
2058 }
2059 
2060 void
2061 vec4_visitor::visit(ir_return *ir)
2062 {
2063    assert(!"not reached");
2064 }
2065 
2066 void
2067 vec4_visitor::visit(ir_discard *ir)
2068 {
2069    assert(!"not reached");
2070 }
2071 
2072 void
2073 vec4_visitor::visit(ir_if *ir)
2074 {
2075    /* Don't point the annotation at the if statement, because then it plus
2076     * the then and else blocks get printed.
2077     */
2078    this->base_ir = ir->condition;
2079 
2080    if (intel->gen == 6) {
2081       emit_if_gen6(ir);
2082    } else {
2083       uint32_t predicate;
2084       emit_bool_to_cond_code(ir->condition, &predicate);
2085       emit(IF(predicate));
2086    }
2087 
2088    visit_instructions(&ir->then_instructions);
2089 
2090    if (!ir->else_instructions.is_empty()) {
2091       this->base_ir = ir->condition;
2092       emit(BRW_OPCODE_ELSE);
2093 
2094       visit_instructions(&ir->else_instructions);
2095    }
2096 
2097    this->base_ir = ir->condition;
2098    emit(BRW_OPCODE_ENDIF);
2099 }
2100 
2101 void
2102 vec4_visitor::emit_ndc_computation()
2103 {
2104    /* Get the position */
2105    src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2106 
2107    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2108    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2109    output_reg[BRW_VERT_RESULT_NDC] = ndc;
2110 
2111    current_annotation = "NDC";
2112    dst_reg ndc_w = ndc;
2113    ndc_w.writemask = WRITEMASK_W;
2114    src_reg pos_w = pos;
2115    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2116    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2117 
2118    dst_reg ndc_xyz = ndc;
2119    ndc_xyz.writemask = WRITEMASK_XYZ;
2120 
2121    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2122 }
2123 
2124 void
2125 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2126 {
2127    if (intel->gen < 6 &&
2128        ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2129         c->key.userclip_active || brw->has_negative_rhw_bug)) {
2130       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2131       dst_reg header1_w = header1;
2132       header1_w.writemask = WRITEMASK_W;
2133       GLuint i;
2134 
2135       emit(MOV(header1, 0u));
2136 
2137       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2138 	 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2139 
2140 	 current_annotation = "Point size";
2141 	 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2142 	 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2143       }
2144 
2145       current_annotation = "Clipping flags";
2146       for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2147 	 vec4_instruction *inst;
2148 
2149 	 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2150                          src_reg(this->userplane[i])));
2151 	 inst->conditional_mod = BRW_CONDITIONAL_L;
2152 
2153 	 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2154 	 inst->predicate = BRW_PREDICATE_NORMAL;
2155       }
2156 
2157       /* i965 clipping workaround:
2158        * 1) Test for -ve rhw
2159        * 2) If set,
2160        *      set ndc = (0,0,0,0)
2161        *      set ucp[6] = 1
2162        *
2163        * Later, clipping will detect ucp[6] and ensure the primitive is
2164        * clipped against all fixed planes.
2165        */
2166       if (brw->has_negative_rhw_bug) {
2167 #if 0
2168 	 /* FINISHME */
2169 	 brw_CMP(p,
2170 		 vec8(brw_null_reg()),
2171 		 BRW_CONDITIONAL_L,
2172 		 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2173 		 brw_imm_f(0));
2174 
2175 	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2176 	 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2177 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2178 #endif
2179       }
2180 
2181       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2182    } else if (intel->gen < 6) {
2183       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2184    } else {
2185       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2186       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2187          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2188                   src_reg(output_reg[VERT_RESULT_PSIZ])));
2189       }
2190    }
2191 }
2192 
2193 void
2194 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2195 {
2196    if (intel->gen < 6) {
2197       /* Clip distance slots are set aside in gen5, but they are not used.  It
2198        * is not clear whether we actually need to set aside space for them,
2199        * but the performance cost is negligible.
2200        */
2201       return;
2202    }
2203 
2204    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2205     *
2206     *     "If a linked set of shaders forming the vertex stage contains no
2207     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2208     *     application has requested clipping against user clip planes through
2209     *     the API, then the coordinate written to gl_Position is used for
2210     *     comparison against the user clip planes."
2211     *
2212     * This function is only called if the shader didn't write to
2213     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2214     * if the user wrote to it; otherwise we use gl_Position.
2215     */
2216    gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2217    if (!(c->prog_data.outputs_written
2218          & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2219       clip_vertex = VERT_RESULT_HPOS;
2220    }
2221 
2222    for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2223         ++i) {
2224       emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2225                src_reg(output_reg[clip_vertex]),
2226                src_reg(this->userplane[i + offset])));
2227    }
2228 }
2229 
2230 void
2231 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2232 {
2233    assert (vert_result < VERT_RESULT_MAX);
2234    reg.type = output_reg[vert_result].type;
2235    current_annotation = output_reg_annotation[vert_result];
2236    /* Copy the register, saturating if necessary */
2237    vec4_instruction *inst = emit(MOV(reg,
2238                                      src_reg(output_reg[vert_result])));
2239    if ((vert_result == VERT_RESULT_COL0 ||
2240         vert_result == VERT_RESULT_COL1 ||
2241         vert_result == VERT_RESULT_BFC0 ||
2242         vert_result == VERT_RESULT_BFC1) &&
2243        c->key.clamp_vertex_color) {
2244       inst->saturate = true;
2245    }
2246 }
2247 
2248 void
2249 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2250 {
2251    struct brw_reg hw_reg = brw_message_reg(mrf);
2252    dst_reg reg = dst_reg(MRF, mrf);
2253    reg.type = BRW_REGISTER_TYPE_F;
2254 
2255    switch (vert_result) {
2256    case VERT_RESULT_PSIZ:
2257       /* PSIZ is always in slot 0, and is coupled with other flags. */
2258       current_annotation = "indices, point width, clip flags";
2259       emit_psiz_and_flags(hw_reg);
2260       break;
2261    case BRW_VERT_RESULT_NDC:
2262       current_annotation = "NDC";
2263       emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2264       break;
2265    case BRW_VERT_RESULT_HPOS_DUPLICATE:
2266    case VERT_RESULT_HPOS:
2267       current_annotation = "gl_Position";
2268       emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2269       break;
2270    case VERT_RESULT_CLIP_DIST0:
2271    case VERT_RESULT_CLIP_DIST1:
2272       if (this->c->key.uses_clip_distance) {
2273          emit_generic_urb_slot(reg, vert_result);
2274       } else {
2275          current_annotation = "user clip distances";
2276          emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2277       }
2278       break;
2279    case VERT_RESULT_EDGE:
2280       /* This is present when doing unfilled polygons.  We're supposed to copy
2281        * the edge flag from the user-provided vertex array
2282        * (glEdgeFlagPointer), or otherwise from the current value
2283        * of that attribute (starts as 1.0f).  This is then used in clipping to
2284        * determine which edges should be drawn as wireframe.
2285        */
2286       current_annotation = "edge flag";
2287       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2288                                     glsl_type::float_type, WRITEMASK_XYZW))));
2289       break;
2290    case BRW_VERT_RESULT_PAD:
2291       /* No need to write to this slot */
2292       break;
2293    default:
2294       emit_generic_urb_slot(reg, vert_result);
2295       break;
2296    }
2297 }
2298 
2299 static int
2300 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2301 {
2302    struct intel_context *intel = &brw->intel;
2303 
2304    if (intel->gen >= 6) {
2305       /* URB data written (does not include the message header reg) must
2306        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2307        * section 5.4.3.2.2: URB_INTERLEAVED.
2308        *
2309        * URB entries are allocated on a multiple of 1024 bits, so an
2310        * extra 128 bits written here to make the end align to 256 is
2311        * no problem.
2312        */
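      /* For example, a message of 1 header reg plus 3 data regs (mlen 4)
       * has an odd amount of URB data, so mlen is bumped to 5 to round the
       * data up to an even number of registers.
       */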
2313       if ((mlen % 2) != 1)
2314 	 mlen++;
2315    }
2316 
2317    return mlen;
2318 }
2319 
2320 /**
2321  * Generates the VUE payload plus the 1 or 2 URB write instructions to
2322  * complete the VS thread.
2323  *
2324  * The VUE layout is documented in Volume 2a.
2325  */
2326 void
2327 vec4_visitor::emit_urb_writes()
2328 {
2329    /* MRF 0 is reserved for the debugger, so start with message header
2330     * in MRF 1.
2331     */
2332    int base_mrf = 1;
2333    int mrf = base_mrf;
2334    /* In the process of generating our URB write message contents, we
2335     * may need to unspill a register or load from an array.  Those
2336     * reads would use MRFs 14-15.
2337     */
2338    int max_usable_mrf = 13;
2339 
2340    /* The following assertion verifies that max_usable_mrf causes an
2341     * even-numbered amount of URB write data, which will meet gen6's
2342     * requirements for length alignment.
2343     */
2344    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2345 
2346    /* First mrf is the g0-based message header containing URB handles and such,
2347     * which is implied in VS_OPCODE_URB_WRITE.
2348     */
2349    mrf++;
2350 
2351    if (intel->gen < 6) {
2352       emit_ndc_computation();
2353    }
2354 
2355    /* Set up the VUE data for the first URB write */
2356    int slot;
2357    for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2358       emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2359 
2360       /* If this was max_usable_mrf, we can't fit anything more into this URB
2361        * WRITE.
2362        */
2363       if (mrf > max_usable_mrf) {
2364 	 slot++;
2365 	 break;
2366       }
2367    }
2368 
2369    current_annotation = "URB write";
2370    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2371    inst->base_mrf = base_mrf;
2372    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2373    inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2374 
2375    /* Optional second URB write */
2376    if (!inst->eot) {
2377       mrf = base_mrf + 1;
2378 
2379       for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2380 	 assert(mrf < max_usable_mrf);
2381 
2382          emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2383       }
2384 
2385       current_annotation = "URB write";
2386       inst = emit(VS_OPCODE_URB_WRITE);
2387       inst->base_mrf = base_mrf;
2388       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2389       inst->eot = true;
2390       /* URB destination offset.  In the previous write, we got MRFs
2391        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
2392        * URB row increments, and each of our MRFs is half of one of
2393        * those, since we're doing interleaved writes.
2394        */
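      /* With base_mrf = 1 and max_usable_mrf = 13 this works out to
       * (13 - 1) / 2 = 6 URB rows skipped for the second write.
       */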
2395       inst->offset = (max_usable_mrf - base_mrf) / 2;
2396    }
2397 }
2398 
2399 src_reg
2400 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2401 				 src_reg *reladdr, int reg_offset)
2402 {
2403    /* Because we store the values to scratch interleaved like our
2404     * vertex data, we need to scale the vec4 index by 2.
2405     */
2406    int message_header_scale = 2;
2407 
2408    /* Pre-gen6, the message header uses byte offsets instead of vec4
2409     * (16-byte) offset units.
2410     */
2411    if (intel->gen < 6)
2412       message_header_scale *= 16;
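   /* For example, a reg_offset of 3 becomes an offset of 6 vec4 slots on
    * gen6+, or 3 * 2 * 16 = 96 bytes on gen4/5.
    */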
2413 
2414    if (reladdr) {
2415       src_reg index = src_reg(this, glsl_type::int_type);
2416 
2417       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2418       emit_before(inst, MUL(dst_reg(index),
2419 			    index, src_reg(message_header_scale)));
2420 
2421       return index;
2422    } else {
2423       return src_reg(reg_offset * message_header_scale);
2424    }
2425 }
2426 
2427 src_reg
2428 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2429 				       src_reg *reladdr, int reg_offset)
2430 {
2431    if (reladdr) {
2432       src_reg index = src_reg(this, glsl_type::int_type);
2433 
2434       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2435 
2436       /* Pre-gen6, the message header uses byte offsets instead of vec4
2437        * (16-byte) offset units.
2438        */
2439       if (intel->gen < 6) {
2440 	 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2441       }
2442 
2443       return index;
2444    } else {
2445       int message_header_scale = intel->gen < 6 ? 16 : 1;
2446       return src_reg(reg_offset * message_header_scale);
2447    }
2448 }
2449 
2450 /**
2451  * Emits an instruction before @inst to load the value named by @orig_src
2452  * from scratch space at @base_offset to @temp.
2453  *
2454  * @base_offset is measured in 32-byte units (the size of a register).
2455  */
2456 void
2457 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2458 				dst_reg temp, src_reg orig_src,
2459 				int base_offset)
2460 {
2461    int reg_offset = base_offset + orig_src.reg_offset;
2462    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2463 
2464    emit_before(inst, SCRATCH_READ(temp, index));
2465 }
2466 
2467 /**
2468  * Emits an instruction after @inst to store the value to be written
2469  * to @orig_dst to scratch space at @base_offset, from @temp.
2470  *
2471  * @base_offset is measured in 32-byte units (the size of a register).
2472  */
2473 void
2474 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2475 				 src_reg temp, dst_reg orig_dst,
2476 				 int base_offset)
2477 {
2478    int reg_offset = base_offset + orig_dst.reg_offset;
2479    src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2480 
2481    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2482 				       orig_dst.writemask));
2483    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2484    write->predicate = inst->predicate;
2485    write->ir = inst->ir;
2486    write->annotation = inst->annotation;
2487    inst->insert_after(write);
2488 }
2489 
2490 /**
2491  * We can't generally support array access in GRF space, because a
2492  * single instruction's destination can only span 2 contiguous
2493  * registers.  So, we send all GRF arrays that get variable index
2494  * access to scratch space.
2495  */
2496 void
2497 vec4_visitor::move_grf_array_access_to_scratch()
2498 {
2499    int scratch_loc[this->virtual_grf_count];
2500 
2501    for (int i = 0; i < this->virtual_grf_count; i++) {
2502       scratch_loc[i] = -1;
2503    }
2504 
2505    /* First, calculate the set of virtual GRFs that need to be punted
2506     * to scratch due to having any array access on them, and where in
2507     * scratch.
2508     */
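   /* For instance, a shader-local "vec4 a[4]" indexed with a variable shows
    * up here as a reladdr source or destination, so the whole array is
    * assigned a contiguous block of scratch starting at c->last_scratch.
    */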
2509    foreach_list(node, &this->instructions) {
2510       vec4_instruction *inst = (vec4_instruction *)node;
2511 
2512       if (inst->dst.file == GRF && inst->dst.reladdr &&
2513 	  scratch_loc[inst->dst.reg] == -1) {
2514 	 scratch_loc[inst->dst.reg] = c->last_scratch;
2515 	 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2516       }
2517 
2518       for (int i = 0 ; i < 3; i++) {
2519 	 src_reg *src = &inst->src[i];
2520 
2521 	 if (src->file == GRF && src->reladdr &&
2522 	     scratch_loc[src->reg] == -1) {
2523 	    scratch_loc[src->reg] = c->last_scratch;
2524 	    c->last_scratch += this->virtual_grf_sizes[src->reg];
2525 	 }
2526       }
2527    }
2528 
2529    /* Now, for anything that will be accessed through scratch, rewrite
2530     * it to load/store.  Note that this is a _safe list walk, because
2531     * we may generate a new scratch_write instruction after the one
2532     * we're processing.
2533     */
2534    foreach_list_safe(node, &this->instructions) {
2535       vec4_instruction *inst = (vec4_instruction *)node;
2536 
2537       /* Set up the annotation tracking for new generated instructions. */
2538       base_ir = inst->ir;
2539       current_annotation = inst->annotation;
2540 
2541       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2542 	 src_reg temp = src_reg(this, glsl_type::vec4_type);
2543 
2544 	 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2545 
2546 	 inst->dst.file = temp.file;
2547 	 inst->dst.reg = temp.reg;
2548 	 inst->dst.reg_offset = temp.reg_offset;
2549 	 inst->dst.reladdr = NULL;
2550       }
2551 
2552       for (int i = 0 ; i < 3; i++) {
2553 	 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2554 	    continue;
2555 
2556 	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2557 
2558 	 emit_scratch_read(inst, temp, inst->src[i],
2559 			   scratch_loc[inst->src[i].reg]);
2560 
2561 	 inst->src[i].file = temp.file;
2562 	 inst->src[i].reg = temp.reg;
2563 	 inst->src[i].reg_offset = temp.reg_offset;
2564 	 inst->src[i].reladdr = NULL;
2565       }
2566    }
2567 }
2568 
2569 /**
2570  * Emits an instruction before @inst to load the value named by @orig_src
2571  * from the pull constant buffer (surface) at @base_offset to @temp.
2572  */
2573 void
2574 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2575 				      dst_reg temp, src_reg orig_src,
2576 				      int base_offset)
2577 {
2578    int reg_offset = base_offset + orig_src.reg_offset;
2579    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2580    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2581    vec4_instruction *load;
2582 
2583    load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2584 					temp, index, offset);
2585    load->base_mrf = 14;
2586    load->mlen = 1;
2587    emit_before(inst, load);
2588 }
2589 
2590 /**
2591  * Implements array access of uniforms by inserting a
2592  * PULL_CONSTANT_LOAD instruction.
2593  *
2594  * Unlike temporary GRF array access (where we don't support it due to
2595  * the difficulty of doing relative addressing on instruction
2596  * destinations), we could potentially do array access of uniforms
2597  * that were loaded in GRF space as push constants.  In real-world
2598  * usage we've seen, though, the arrays being used are always larger
2599  * than we could load as push constants, so just always move all
2600  * uniform array access out to a pull constant buffer.
2601  */
2602 void
2603 vec4_visitor::move_uniform_array_access_to_pull_constants()
2604 {
2605    int pull_constant_loc[this->uniforms];
2606 
2607    for (int i = 0; i < this->uniforms; i++) {
2608       pull_constant_loc[i] = -1;
2609    }
2610 
2611    /* Walk through and find array access of uniforms.  Put a copy of that
2612     * uniform in the pull constant buffer.
2613     *
2614     * Note that we don't move constant-indexed accesses to arrays.  No
2615     * testing has been done of the performance impact of this choice.
2616     */
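   /* For instance, "uniform vec4 u[8]" accessed as u[i] gets all eight
    * vec4s copied into pull_param and the access replaced with the result
    * of a VS_OPCODE_PULL_CONSTANT_LOAD, while a constant-indexed u[2]
    * stays a push constant.
    */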
2617    foreach_list_safe(node, &this->instructions) {
2618       vec4_instruction *inst = (vec4_instruction *)node;
2619 
2620       for (int i = 0 ; i < 3; i++) {
2621 	 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2622 	    continue;
2623 
2624 	 int uniform = inst->src[i].reg;
2625 
2626 	 /* If this array isn't already present in the pull constant buffer,
2627 	  * add it.
2628 	  */
2629 	 if (pull_constant_loc[uniform] == -1) {
2630 	    const float **values = &prog_data->param[uniform * 4];
2631 
2632 	    pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2633 
2634 	    for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2635 	       prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2636 	    }
2637 	 }
2638 
2639 	 /* Set up the annotation tracking for new generated instructions. */
2640 	 base_ir = inst->ir;
2641 	 current_annotation = inst->annotation;
2642 
2643 	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2644 
2645 	 emit_pull_constant_load(inst, temp, inst->src[i],
2646 				 pull_constant_loc[uniform]);
2647 
2648 	 inst->src[i].file = temp.file;
2649 	 inst->src[i].reg = temp.reg;
2650 	 inst->src[i].reg_offset = temp.reg_offset;
2651 	 inst->src[i].reladdr = NULL;
2652       }
2653    }
2654 
2655    /* Now there are no accesses of the UNIFORM file with a reladdr, so
2656     * no need to track them as larger-than-vec4 objects.  This will be
2657     * relied on in cutting out unused uniform vectors from push
2658     * constants.
2659     */
2660    split_uniform_registers();
2661 }
2662 
2663 void
2664 vec4_visitor::resolve_ud_negate(src_reg *reg)
2665 {
2666    if (reg->type != BRW_REGISTER_TYPE_UD ||
2667        !reg->negate)
2668       return;
2669 
2670    src_reg temp = src_reg(this, glsl_type::uvec4_type);
2671    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2672    *reg = temp;
2673 }
2674 
2675 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2676 			   struct gl_shader_program *prog,
2677 			   struct brw_shader *shader)
2678 {
2679    this->c = c;
2680    this->p = &c->func;
2681    this->brw = p->brw;
2682    this->intel = &brw->intel;
2683    this->ctx = &intel->ctx;
2684    this->prog = prog;
2685    this->shader = shader;
2686 
2687    this->mem_ctx = ralloc_context(NULL);
2688    this->failed = false;
2689 
2690    this->base_ir = NULL;
2691    this->current_annotation = NULL;
2692 
2693    this->c = c;
2694    this->vp = (struct gl_vertex_program *)
2695      prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2696    this->prog_data = &c->prog_data;
2697 
2698    this->variable_ht = hash_table_ctor(0,
2699 				       hash_table_pointer_hash,
2700 				       hash_table_pointer_compare);
2701 
2702    this->virtual_grf_def = NULL;
2703    this->virtual_grf_use = NULL;
2704    this->virtual_grf_sizes = NULL;
2705    this->virtual_grf_count = 0;
2706    this->virtual_grf_reg_map = NULL;
2707    this->virtual_grf_reg_count = 0;
2708    this->virtual_grf_array_size = 0;
2709    this->live_intervals_valid = false;
2710 
2711    this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2712 
2713    this->uniforms = 0;
2714 }
2715 
2716 vec4_visitor::~vec4_visitor()
2717 {
2718    ralloc_free(this->mem_ctx);
2719    hash_table_dtor(this->variable_ht);
2720 }
2721 
2722 
2723 void
2724 vec4_visitor::fail(const char *format, ...)
2725 {
2726    va_list va;
2727    char *msg;
2728 
2729    if (failed)
2730       return;
2731 
2732    failed = true;
2733 
2734    va_start(va, format);
2735    msg = ralloc_vasprintf(mem_ctx, format, va);
2736    va_end(va);
2737    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2738 
2739    this->fail_msg = msg;
2740 
2741    if (INTEL_DEBUG & DEBUG_VS) {
2742       fprintf(stderr, "%s",  msg);
2743    }
2744 }
2745 
2746 } /* namespace brw */
2747