1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/macros.h"
27 #include "program/prog_parameter.h"
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, dst_reg dst,
35 src_reg src0, src_reg src1, src_reg src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->ir = v->base_ir;
43 this->annotation = v->current_annotation;
44 }
45
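/* Append an already-constructed instruction to the end of the current
 * instruction stream and return it.
 */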
46 vec4_instruction *
47 vec4_visitor::emit(vec4_instruction *inst)
48 {
49 this->instructions.push_tail(inst);
50
51 return inst;
52 }
53
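/* Insert new_inst into the stream immediately before inst, copying inst's
 * source IR pointer and annotation so debug output stays attributed to the
 * right statement.  Note that this returns the original inst, not new_inst.
 */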
54 vec4_instruction *
55 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
56 {
57 new_inst->ir = inst->ir;
58 new_inst->annotation = inst->annotation;
59
60 inst->insert_before(new_inst);
61
62 return inst;
63 }
64
65 vec4_instruction *
66 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
67 src_reg src0, src_reg src1, src_reg src2)
68 {
69 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
70 src0, src1, src2));
71 }
72
73
74 vec4_instruction *
75 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
76 {
77 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
84 }
85
86 vec4_instruction *
87 vec4_visitor::emit(enum opcode opcode)
88 {
89 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
90 }
91
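/* Builders for one- and two-source ALU instructions (MOV, ADD, AND, ...).
 * These construct a vec4_instruction out of mem_ctx but do not emit it;
 * callers pass the result to emit().
 */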
92 #define ALU1(op) \
93 vec4_instruction * \
94 vec4_visitor::op(dst_reg dst, src_reg src0) \
95 { \
96 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
97 src0); \
98 }
99
100 #define ALU2(op) \
101 vec4_instruction * \
102 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
103 { \
104 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
105 src0, src1); \
106 }
107
108 ALU1(NOT)
109 ALU1(MOV)
110 ALU1(FRC)
111 ALU1(RNDD)
112 ALU1(RNDE)
113 ALU1(RNDZ)
114 ALU2(ADD)
115 ALU2(MUL)
116 ALU2(MACH)
117 ALU2(AND)
118 ALU2(OR)
119 ALU2(XOR)
120 ALU2(DP3)
121 ALU2(DP4)
122
123 /** Gen4 predicated IF. */
124 vec4_instruction *
125 vec4_visitor::IF(uint32_t predicate)
126 {
127 vec4_instruction *inst;
128
129 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
130 inst->predicate = predicate;
131
132 return inst;
133 }
134
135 /** Gen6+ IF with embedded comparison. */
136 vec4_instruction *
137 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
138 {
139 assert(intel->gen >= 6);
140
141 vec4_instruction *inst;
142
143 resolve_ud_negate(&src0);
144 resolve_ud_negate(&src1);
145
146 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
147 src0, src1);
148 inst->conditional_mod = condition;
149
150 return inst;
151 }
152
153 /**
154 * CMP: Sets the low bit of the destination channels with the result
155 * of the comparison, while the upper bits are undefined, and updates
156 * the flag register with the packed 16 bits of the result.
157 */
158 vec4_instruction *
159 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
160 {
161 vec4_instruction *inst;
162
163 /* original gen4 does type conversion to the destination type
164 * before comparison, producing garbage results for floating
165 * point comparisons.
166 */
167 if (intel->gen == 4) {
168 dst.type = src0.type;
169 if (dst.file == HW_REG)
170 dst.fixed_hw_reg.type = dst.type;
171 }
172
173 resolve_ud_negate(&src0);
174 resolve_ud_negate(&src1);
175
176 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
177 inst->conditional_mod = condition;
178
179 return inst;
180 }
181
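/* Messages for reading and writing a vec4 through the scratch (spill)
 * buffer.  Only base_mrf and mlen are decided here.
 */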
182 vec4_instruction *
183 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
184 {
185 vec4_instruction *inst;
186
187 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
188 dst, index);
189 inst->base_mrf = 14;
190 inst->mlen = 1;
191
192 return inst;
193 }
194
195 vec4_instruction *
196 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
197 {
198 vec4_instruction *inst;
199
200 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
201 dst, src, index);
202 inst->base_mrf = 13;
203 inst->mlen = 2;
204
205 return inst;
206 }
207
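/* Emit a DP2, DP3 or DP4 depending on the number of components in the
 * dot product.
 */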
208 void
209 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
210 {
211 static enum opcode dot_opcodes[] = {
212 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
213 };
214
215 emit(dot_opcodes[elements - 2], dst, src0, src1);
216 }
217
218 void
219 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
220 {
221 /* The gen6 math instruction ignores the source modifiers --
222 * swizzle, abs, negate, and at least some parts of the register
223 * region description.
224 *
225 * While it would seem that this MOV could be avoided at this point
226 * in the case that the swizzle is matched up with the destination
227 * writemask, note that uniform packing and register allocation
228 * could rearrange our swizzle, so let's leave this matter up to
229 * copy propagation later.
230 */
231 src_reg temp_src = src_reg(this, glsl_type::vec4_type);
232 emit(MOV(dst_reg(temp_src), src));
233
234 if (dst.writemask != WRITEMASK_XYZW) {
235 /* The gen6 math instruction must be align1, so we can't do
236 * writemasks.
237 */
238 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
239
240 emit(opcode, temp_dst, temp_src);
241
242 emit(MOV(dst, src_reg(temp_dst)));
243 } else {
244 emit(opcode, dst, temp_src);
245 }
246 }
247
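/* On gen4/5, math is a send to the shared math unit, so the operand is
 * staged through an MRF.
 */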
248 void
249 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
250 {
251 vec4_instruction *inst = emit(opcode, dst, src);
252 inst->base_mrf = 1;
253 inst->mlen = 1;
254 }
255
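/* Emit a single-operand math instruction, dispatching to the
 * generation-specific handling for gen6 and earlier.
 */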
256 void
257 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
258 {
259 switch (opcode) {
260 case SHADER_OPCODE_RCP:
261 case SHADER_OPCODE_RSQ:
262 case SHADER_OPCODE_SQRT:
263 case SHADER_OPCODE_EXP2:
264 case SHADER_OPCODE_LOG2:
265 case SHADER_OPCODE_SIN:
266 case SHADER_OPCODE_COS:
267 break;
268 default:
269 assert(!"not reached: bad math opcode");
270 return;
271 }
272
273 if (intel->gen >= 7) {
274 emit(opcode, dst, src);
275 } else if (intel->gen == 6) {
276 return emit_math1_gen6(opcode, dst, src);
277 } else {
278 return emit_math1_gen4(opcode, dst, src);
279 }
280 }
281
282 void
283 vec4_visitor::emit_math2_gen6(enum opcode opcode,
284 dst_reg dst, src_reg src0, src_reg src1)
285 {
286 src_reg expanded;
287
288 /* The gen6 math instruction ignores the source modifiers --
289 * swizzle, abs, negate, and at least some parts of the register
290 * region description. Move the sources to temporaries to make it
291 * generally work.
292 */
293
294 expanded = src_reg(this, glsl_type::vec4_type);
295 expanded.type = src0.type;
296 emit(MOV(dst_reg(expanded), src0));
297 src0 = expanded;
298
299 expanded = src_reg(this, glsl_type::vec4_type);
300 expanded.type = src1.type;
301 emit(MOV(dst_reg(expanded), src1));
302 src1 = expanded;
303
304 if (dst.writemask != WRITEMASK_XYZW) {
305 /* The gen6 math instruction must be align1, so we can't do
306 * writemasks.
307 */
308 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
309 temp_dst.type = dst.type;
310
311 emit(opcode, temp_dst, src0, src1);
312
313 emit(MOV(dst, src_reg(temp_dst)));
314 } else {
315 emit(opcode, dst, src0, src1);
316 }
317 }
318
319 void
320 vec4_visitor::emit_math2_gen4(enum opcode opcode,
321 dst_reg dst, src_reg src0, src_reg src1)
322 {
323 vec4_instruction *inst = emit(opcode, dst, src0, src1);
324 inst->base_mrf = 1;
325 inst->mlen = 2;
326 }
327
328 void
329 vec4_visitor::emit_math(enum opcode opcode,
330 dst_reg dst, src_reg src0, src_reg src1)
331 {
332 switch (opcode) {
333 case SHADER_OPCODE_POW:
334 case SHADER_OPCODE_INT_QUOTIENT:
335 case SHADER_OPCODE_INT_REMAINDER:
336 break;
337 default:
338 assert(!"not reached: unsupported binary math opcode");
339 return;
340 }
341
342 if (intel->gen >= 7) {
343 emit(opcode, dst, src0, src1);
344 } else if (intel->gen == 6) {
345 return emit_math2_gen6(opcode, dst, src0, src1);
346 } else {
347 return emit_math2_gen4(opcode, dst, src0, src1);
348 }
349 }
350
351 void
352 vec4_visitor::visit_instructions(const exec_list *list)
353 {
354 foreach_list(node, list) {
355 ir_instruction *ir = (ir_instruction *)node;
356
357 base_ir = ir;
358 ir->accept(this);
359 }
360 }
361
362
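/* Size of a variable of the given type, in vec4 registers (the unit used
 * for virtual GRF and uniform allocation in this backend).
 */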
363 static int
364 type_size(const struct glsl_type *type)
365 {
366 unsigned int i;
367 int size;
368
369 switch (type->base_type) {
370 case GLSL_TYPE_UINT:
371 case GLSL_TYPE_INT:
372 case GLSL_TYPE_FLOAT:
373 case GLSL_TYPE_BOOL:
374 if (type->is_matrix()) {
375 return type->matrix_columns;
376 } else {
377 /* Regardless of size of vector, it gets a vec4. This is bad
378 * packing for things like floats, but otherwise arrays become a
379 * mess. Hopefully a later pass over the code can pack scalars
380 * down if appropriate.
381 */
382 return 1;
383 }
384 case GLSL_TYPE_ARRAY:
385 assert(type->length > 0);
386 return type_size(type->fields.array) * type->length;
387 case GLSL_TYPE_STRUCT:
388 size = 0;
389 for (i = 0; i < type->length; i++) {
390 size += type_size(type->fields.structure[i].type);
391 }
392 return size;
393 case GLSL_TYPE_SAMPLER:
394 /* Samplers take up one slot in UNIFORMS[], but they're baked in
395 * at link time.
396 */
397 return 1;
398 default:
399 assert(0);
400 return 0;
401 }
402 }
403
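/* Allocate a new virtual GRF of the given size in vec4 registers, growing
 * the size and offset tracking arrays if needed, and return its index.
 */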
404 int
405 vec4_visitor::virtual_grf_alloc(int size)
406 {
407 if (virtual_grf_array_size <= virtual_grf_count) {
408 if (virtual_grf_array_size == 0)
409 virtual_grf_array_size = 16;
410 else
411 virtual_grf_array_size *= 2;
412 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
413 virtual_grf_array_size);
414 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
415 virtual_grf_array_size);
416 }
417 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
418 virtual_grf_reg_count += size;
419 virtual_grf_sizes[virtual_grf_count] = size;
420 return virtual_grf_count++;
421 }
422
423 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
424 {
425 init();
426
427 this->file = GRF;
428 this->reg = v->virtual_grf_alloc(type_size(type));
429
430 if (type->is_array() || type->is_record()) {
431 this->swizzle = BRW_SWIZZLE_NOOP;
432 } else {
433 this->swizzle = swizzle_for_size(type->vector_elements);
434 }
435
436 this->type = brw_type_for_base_type(type);
437 }
438
439 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
440 {
441 init();
442
443 this->file = GRF;
444 this->reg = v->virtual_grf_alloc(type_size(type));
445
446 if (type->is_array() || type->is_record()) {
447 this->writemask = WRITEMASK_XYZW;
448 } else {
449 this->writemask = (1 << type->vector_elements) - 1;
450 }
451
452 this->type = brw_type_for_base_type(type);
453 }
454
455 /* Our support for uniforms is piggy-backed on the struct
456 * gl_fragment_program, because that's where the values actually
457 * get stored, rather than in some global gl_shader_program uniform
458 * store.
459 */
460 int
461 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
462 {
463 unsigned int offset = 0;
464 float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
465
466 if (type->is_matrix()) {
467 const glsl_type *column = type->column_type();
468
469 for (unsigned int i = 0; i < type->matrix_columns; i++) {
470 offset += setup_uniform_values(loc + offset, column);
471 }
472
473 return offset;
474 }
475
476 switch (type->base_type) {
477 case GLSL_TYPE_FLOAT:
478 case GLSL_TYPE_UINT:
479 case GLSL_TYPE_INT:
480 case GLSL_TYPE_BOOL:
481 for (unsigned int i = 0; i < type->vector_elements; i++) {
482 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
483 }
484
485 /* Set up pad elements to get things aligned to a vec4 boundary. */
486 for (unsigned int i = type->vector_elements; i < 4; i++) {
487 static float zero = 0;
488
489 c->prog_data.param[this->uniforms * 4 + i] = &zero;
490 }
491
492 /* Track the size of this uniform vector, for future packing of
493 * uniforms.
494 */
495 this->uniform_vector_size[this->uniforms] = type->vector_elements;
496 this->uniforms++;
497
498 return 1;
499
500 case GLSL_TYPE_STRUCT:
501 for (unsigned int i = 0; i < type->length; i++) {
502 offset += setup_uniform_values(loc + offset,
503 type->fields.structure[i].type);
504 }
505 return offset;
506
507 case GLSL_TYPE_ARRAY:
508 for (unsigned int i = 0; i < type->length; i++) {
509 offset += setup_uniform_values(loc + offset, type->fields.array);
510 }
511 return offset;
512
513 case GLSL_TYPE_SAMPLER:
514 /* The sampler takes up a slot, but we don't use any values from it. */
515 return 1;
516
517 default:
518 assert(!"not reached");
519 return 0;
520 }
521 }
522
523 void
524 vec4_visitor::setup_uniform_clipplane_values()
525 {
526 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
527
528 /* Pre-Gen6, we compact clip planes. For example, if the user
529 * enables just clip planes 0, 1, and 3, we will enable clip planes
530 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
531 * plane 2. This simplifies the implementation of the Gen6 clip
532 * thread.
533 *
534 * In Gen6 and later, we don't compact clip planes, because this
535 * simplifies the implementation of gl_ClipDistance.
536 */
537 int compacted_clipplane_index = 0;
538 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
539 if (intel->gen < 6 &&
540 !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
541 continue;
542 }
543 this->uniform_vector_size[this->uniforms] = 4;
544 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
545 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
546 for (int j = 0; j < 4; ++j) {
547 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
548 }
549 ++compacted_clipplane_index;
550 ++this->uniforms;
551 }
552 }
553
554 /* Our support for builtin uniforms is even scarier than non-builtin.
555 * It sits on top of the PROG_STATE_VAR parameters that are
556 * automatically updated from GL context state.
557 */
558 void
559 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
560 {
561 const ir_state_slot *const slots = ir->state_slots;
562 assert(ir->state_slots != NULL);
563
564 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
565 /* This state reference has already been setup by ir_to_mesa,
566 * but we'll get the same index back here. We can reference
567 * ParameterValues directly, since unlike brw_fs.cpp, we never
568 * add new state references during compile.
569 */
570 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
571 (gl_state_index *)slots[i].tokens);
572 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
573
574 this->uniform_vector_size[this->uniforms] = 0;
575 /* Add each of the unique swizzled channels of the element.
576 * This will end up matching the size of the glsl_type of this field.
577 */
578 int last_swiz = -1;
579 for (unsigned int j = 0; j < 4; j++) {
580 int swiz = GET_SWZ(slots[i].swizzle, j);
581 last_swiz = swiz;
582
583 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
584 if (swiz <= last_swiz)
585 this->uniform_vector_size[this->uniforms]++;
586 }
587 this->uniforms++;
588 }
589 }
590
591 dst_reg *
592 vec4_visitor::variable_storage(ir_variable *var)
593 {
594 return (dst_reg *)hash_table_find(this->variable_ht, var);
595 }
596
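/* Emit instructions that set the flag register according to a boolean
 * condition, and report through *predicate how a following instruction
 * should predicate on that flag.
 */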
597 void
598 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
599 {
600 ir_expression *expr = ir->as_expression();
601
602 *predicate = BRW_PREDICATE_NORMAL;
603
604 if (expr) {
605 src_reg op[2];
606 vec4_instruction *inst;
607
608 assert(expr->get_num_operands() <= 2);
609 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
610 expr->operands[i]->accept(this);
611 op[i] = this->result;
612
613 resolve_ud_negate(&op[i]);
614 }
615
616 switch (expr->operation) {
617 case ir_unop_logic_not:
618 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
619 inst->conditional_mod = BRW_CONDITIONAL_Z;
620 break;
621
622 case ir_binop_logic_xor:
623 inst = emit(XOR(dst_null_d(), op[0], op[1]));
624 inst->conditional_mod = BRW_CONDITIONAL_NZ;
625 break;
626
627 case ir_binop_logic_or:
628 inst = emit(OR(dst_null_d(), op[0], op[1]));
629 inst->conditional_mod = BRW_CONDITIONAL_NZ;
630 break;
631
632 case ir_binop_logic_and:
633 inst = emit(AND(dst_null_d(), op[0], op[1]));
634 inst->conditional_mod = BRW_CONDITIONAL_NZ;
635 break;
636
637 case ir_unop_f2b:
638 if (intel->gen >= 6) {
639 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
640 } else {
641 inst = emit(MOV(dst_null_f(), op[0]));
642 inst->conditional_mod = BRW_CONDITIONAL_NZ;
643 }
644 break;
645
646 case ir_unop_i2b:
647 if (intel->gen >= 6) {
648 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
649 } else {
650 inst = emit(MOV(dst_null_d(), op[0]));
651 inst->conditional_mod = BRW_CONDITIONAL_NZ;
652 }
653 break;
654
655 case ir_binop_all_equal:
656 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
657 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
658 break;
659
660 case ir_binop_any_nequal:
661 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
662 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
663 break;
664
665 case ir_unop_any:
666 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
667 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
668 break;
669
670 case ir_binop_greater:
671 case ir_binop_gequal:
672 case ir_binop_less:
673 case ir_binop_lequal:
674 case ir_binop_equal:
675 case ir_binop_nequal:
676 emit(CMP(dst_null_d(), op[0], op[1],
677 brw_conditional_for_comparison(expr->operation)));
678 break;
679
680 default:
681 assert(!"not reached");
682 break;
683 }
684 return;
685 }
686
687 ir->accept(this);
688
689 resolve_ud_negate(&this->result);
690
691 if (intel->gen >= 6) {
692 vec4_instruction *inst = emit(AND(dst_null_d(),
693 this->result, src_reg(1)));
694 inst->conditional_mod = BRW_CONDITIONAL_NZ;
695 } else {
696 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
697 inst->conditional_mod = BRW_CONDITIONAL_NZ;
698 }
699 }
700
701 /**
702 * Emit a gen6 IF statement with the comparison folded into the IF
703 * instruction.
704 */
705 void
706 vec4_visitor::emit_if_gen6(ir_if *ir)
707 {
708 ir_expression *expr = ir->condition->as_expression();
709
710 if (expr) {
711 src_reg op[2];
712 dst_reg temp;
713
714 assert(expr->get_num_operands() <= 2);
715 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
716 expr->operands[i]->accept(this);
717 op[i] = this->result;
718 }
719
720 switch (expr->operation) {
721 case ir_unop_logic_not:
722 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
723 return;
724
725 case ir_binop_logic_xor:
726 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
727 return;
728
729 case ir_binop_logic_or:
730 temp = dst_reg(this, glsl_type::bool_type);
731 emit(OR(temp, op[0], op[1]));
732 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
733 return;
734
735 case ir_binop_logic_and:
736 temp = dst_reg(this, glsl_type::bool_type);
737 emit(AND(temp, op[0], op[1]));
738 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
739 return;
740
741 case ir_unop_f2b:
742 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
743 return;
744
745 case ir_unop_i2b:
746 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
747 return;
748
749 case ir_binop_greater:
750 case ir_binop_gequal:
751 case ir_binop_less:
752 case ir_binop_lequal:
753 case ir_binop_equal:
754 case ir_binop_nequal:
755 emit(IF(op[0], op[1],
756 brw_conditional_for_comparison(expr->operation)));
757 return;
758
759 case ir_binop_all_equal:
760 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
761 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
762 return;
763
764 case ir_binop_any_nequal:
765 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
766 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
767 return;
768
769 case ir_unop_any:
770 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
771 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
772 return;
773
774 default:
775 assert(!"not reached");
776 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
777 return;
778 }
779 return;
780 }
781
782 ir->condition->accept(this);
783
784 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
785 }
786
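/* Set up storage for a variable and record it in variable_ht.  Inputs and
 * system values map to ATTR registers, outputs to fresh GRFs recorded in
 * output_reg[], and uniforms to UNIFORM slots.
 */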
787 void
788 vec4_visitor::visit(ir_variable *ir)
789 {
790 dst_reg *reg = NULL;
791
792 if (variable_storage(ir))
793 return;
794
795 switch (ir->mode) {
796 case ir_var_in:
797 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
798
799 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
800 * come in as floating point conversions of the integer values.
801 */
802 for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
803 if (!c->key.gl_fixed_input_size[i])
804 continue;
805
806 dst_reg dst = *reg;
807 dst.type = brw_type_for_base_type(ir->type);
808 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
809 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
810 }
811 break;
812
813 case ir_var_out:
814 reg = new(mem_ctx) dst_reg(this, ir->type);
815
816 for (int i = 0; i < type_size(ir->type); i++) {
817 output_reg[ir->location + i] = *reg;
818 output_reg[ir->location + i].reg_offset = i;
819 output_reg[ir->location + i].type =
820 brw_type_for_base_type(ir->type->get_scalar_type());
821 output_reg_annotation[ir->location + i] = ir->name;
822 }
823 break;
824
825 case ir_var_auto:
826 case ir_var_temporary:
827 reg = new(mem_ctx) dst_reg(this, ir->type);
828 break;
829
830 case ir_var_uniform:
831 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
832
833 /* Thanks to the lower_ubo_reference pass, we will see only
834 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
835 * variables, so no need for them to be in variable_ht.
836 */
837 if (ir->uniform_block != -1)
838 return;
839
840 /* Track how big the whole uniform variable is, in case we need to put a
841 * copy of its data into pull constants for array access.
842 */
843 this->uniform_size[this->uniforms] = type_size(ir->type);
844
845 if (!strncmp(ir->name, "gl_", 3)) {
846 setup_builtin_uniform_values(ir);
847 } else {
848 setup_uniform_values(ir->location, ir->type);
849 }
850 break;
851
852 case ir_var_system_value:
853 /* VertexID is stored by the VF as the last vertex element, but
854 * we don't represent it with a flag in inputs_read, so we call
855 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
856 */
857 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
858 prog_data->uses_vertexid = true;
859
860 switch (ir->location) {
861 case SYSTEM_VALUE_VERTEX_ID:
862 reg->writemask = WRITEMASK_X;
863 break;
864 case SYSTEM_VALUE_INSTANCE_ID:
865 reg->writemask = WRITEMASK_Y;
866 break;
867 default:
868 assert(!"not reached");
869 break;
870 }
871 break;
872
873 default:
874 assert(!"not reached");
875 }
876
877 reg->type = brw_type_for_base_type(ir->type);
878 hash_table_insert(this->variable_ht, reg, ir);
879 }
880
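/* Emit a loop as a DO ... WHILE block, with the optional counter
 * initialization, exit comparison (a predicated BREAK), and increment
 * that the GLSL IR provides.
 */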
881 void
882 vec4_visitor::visit(ir_loop *ir)
883 {
884 dst_reg counter;
885
886 /* We don't want debugging output to print the whole body of the
887 * loop as the annotation.
888 */
889 this->base_ir = NULL;
890
891 if (ir->counter != NULL) {
892 this->base_ir = ir->counter;
893 ir->counter->accept(this);
894 counter = *(variable_storage(ir->counter));
895
896 if (ir->from != NULL) {
897 this->base_ir = ir->from;
898 ir->from->accept(this);
899
900 emit(MOV(counter, this->result));
901 }
902 }
903
904 emit(BRW_OPCODE_DO);
905
906 if (ir->to) {
907 this->base_ir = ir->to;
908 ir->to->accept(this);
909
910 emit(CMP(dst_null_d(), src_reg(counter), this->result,
911 brw_conditional_for_comparison(ir->cmp)));
912
913 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
914 inst->predicate = BRW_PREDICATE_NORMAL;
915 }
916
917 visit_instructions(&ir->body_instructions);
918
919
920 if (ir->increment) {
921 this->base_ir = ir->increment;
922 ir->increment->accept(this);
923 emit(ADD(counter, src_reg(counter), this->result));
924 }
925
926 emit(BRW_OPCODE_WHILE);
927 }
928
929 void
930 vec4_visitor::visit(ir_loop_jump *ir)
931 {
932 switch (ir->mode) {
933 case ir_loop_jump::jump_break:
934 emit(BRW_OPCODE_BREAK);
935 break;
936 case ir_loop_jump::jump_continue:
937 emit(BRW_OPCODE_CONTINUE);
938 break;
939 }
940 }
941
942
943 void
944 vec4_visitor::visit(ir_function_signature *ir)
945 {
946 assert(0);
947 (void)ir;
948 }
949
950 void
951 vec4_visitor::visit(ir_function *ir)
952 {
953 /* Ignore function bodies other than main() -- we shouldn't see calls to
954 * them since they should all be inlined.
955 */
956 if (strcmp(ir->name, "main") == 0) {
957 const ir_function_signature *sig;
958 exec_list empty;
959
960 sig = ir->matching_signature(&empty);
961
962 assert(sig);
963
964 visit_instructions(&sig->body);
965 }
966 }
967
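/* If this expression is just a saturate of another rvalue, emit it as a
 * saturating MOV and return true so the caller can skip normal expression
 * handling.
 */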
968 bool
969 vec4_visitor::try_emit_sat(ir_expression *ir)
970 {
971 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
972 if (!sat_src)
973 return false;
974
975 sat_src->accept(this);
976 src_reg src = this->result;
977
978 this->result = src_reg(this, ir->type);
979 vec4_instruction *inst;
980 inst = emit(MOV(dst_reg(this->result), src));
981 inst->saturate = true;
982
983 return true;
984 }
985
986 void
987 vec4_visitor::emit_bool_comparison(unsigned int op,
988 dst_reg dst, src_reg src0, src_reg src1)
989 {
990 /* original gen4 does destination conversion before comparison. */
991 if (intel->gen < 5)
992 dst.type = src0.type;
993
994 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
995
996 dst.type = BRW_REGISTER_TYPE_D;
997 emit(AND(dst, src_reg(dst), src_reg(0x1)));
998 }
999
1000 void
1001 vec4_visitor::visit(ir_expression *ir)
1002 {
1003 unsigned int operand;
1004 src_reg op[Elements(ir->operands)];
1005 src_reg result_src;
1006 dst_reg result_dst;
1007 vec4_instruction *inst;
1008
1009 if (try_emit_sat(ir))
1010 return;
1011
1012 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1013 this->result.file = BAD_FILE;
1014 ir->operands[operand]->accept(this);
1015 if (this->result.file == BAD_FILE) {
1016 printf("Failed to get tree for expression operand:\n");
1017 ir->operands[operand]->print();
1018 exit(1);
1019 }
1020 op[operand] = this->result;
1021
1022 /* Matrix expression operands should have been broken down to vector
1023 * operations already.
1024 */
1025 assert(!ir->operands[operand]->type->is_matrix());
1026 }
1027
1028 int vector_elements = ir->operands[0]->type->vector_elements;
1029 if (ir->operands[1]) {
1030 vector_elements = MAX2(vector_elements,
1031 ir->operands[1]->type->vector_elements);
1032 }
1033
1034 this->result.file = BAD_FILE;
1035
1036 /* Storage for our result. Ideally for an assignment we'd be using
1037 * the actual storage for the result here, instead.
1038 */
1039 result_src = src_reg(this, ir->type);
1040 /* convenience for the emit functions below. */
1041 result_dst = dst_reg(result_src);
1042 /* If nothing special happens, this is the result. */
1043 this->result = result_src;
1044 /* Limit writes to the channels that will be used by result_src later.
1045 * This does limit this temp's use as a temporary for multi-instruction
1046 * sequences.
1047 */
1048 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1049
1050 switch (ir->operation) {
1051 case ir_unop_logic_not:
1052 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1053 * the ones complement of the whole register, not just bit 0.
1054 */
1055 emit(XOR(result_dst, op[0], src_reg(1)));
1056 break;
1057 case ir_unop_neg:
1058 op[0].negate = !op[0].negate;
1059 this->result = op[0];
1060 break;
1061 case ir_unop_abs:
1062 op[0].abs = true;
1063 op[0].negate = false;
1064 this->result = op[0];
1065 break;
1066
1067 case ir_unop_sign:
1068 emit(MOV(result_dst, src_reg(0.0f)));
1069
1070 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1071 inst = emit(MOV(result_dst, src_reg(1.0f)));
1072 inst->predicate = BRW_PREDICATE_NORMAL;
1073
1074 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1075 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1076 inst->predicate = BRW_PREDICATE_NORMAL;
1077
1078 break;
1079
1080 case ir_unop_rcp:
1081 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1082 break;
1083
1084 case ir_unop_exp2:
1085 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1086 break;
1087 case ir_unop_log2:
1088 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1089 break;
1090 case ir_unop_exp:
1091 case ir_unop_log:
1092 assert(!"not reached: should be handled by ir_explog_to_explog2");
1093 break;
1094 case ir_unop_sin:
1095 case ir_unop_sin_reduced:
1096 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1097 break;
1098 case ir_unop_cos:
1099 case ir_unop_cos_reduced:
1100 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1101 break;
1102
1103 case ir_unop_dFdx:
1104 case ir_unop_dFdy:
1105 assert(!"derivatives not valid in vertex shader");
1106 break;
1107
1108 case ir_unop_noise:
1109 assert(!"not reached: should be handled by lower_noise");
1110 break;
1111
1112 case ir_binop_add:
1113 emit(ADD(result_dst, op[0], op[1]));
1114 break;
1115 case ir_binop_sub:
1116 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1117 break;
1118
1119 case ir_binop_mul:
1120 if (ir->type->is_integer()) {
1121 /* For integer multiplication, the MUL uses the low 16 bits
1122 * of one of the operands (src0 on gen6, src1 on gen7). The
1123 * MACH accumulates in the contribution of the upper 16 bits
1124 * of that operand.
1125 *
1126 * FINISHME: Emit just the MUL if we know an operand is small
1127 * enough.
1128 */
1129 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1130
1131 emit(MUL(acc, op[0], op[1]));
1132 emit(MACH(dst_null_d(), op[0], op[1]));
1133 emit(MOV(result_dst, src_reg(acc)));
1134 } else {
1135 emit(MUL(result_dst, op[0], op[1]));
1136 }
1137 break;
1138 case ir_binop_div:
1139 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1140 assert(ir->type->is_integer());
1141 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1142 break;
1143 case ir_binop_mod:
1144 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1145 assert(ir->type->is_integer());
1146 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1147 break;
1148
1149 case ir_binop_less:
1150 case ir_binop_greater:
1151 case ir_binop_lequal:
1152 case ir_binop_gequal:
1153 case ir_binop_equal:
1154 case ir_binop_nequal: {
1155 emit(CMP(result_dst, op[0], op[1],
1156 brw_conditional_for_comparison(ir->operation)));
1157 emit(AND(result_dst, result_src, src_reg(0x1)));
1158 break;
1159 }
1160
1161 case ir_binop_all_equal:
1162 /* "==" operator producing a scalar boolean. */
1163 if (ir->operands[0]->type->is_vector() ||
1164 ir->operands[1]->type->is_vector()) {
1165 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1166 emit(MOV(result_dst, src_reg(0)));
1167 inst = emit(MOV(result_dst, src_reg(1)));
1168 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1169 } else {
1170 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1171 emit(AND(result_dst, result_src, src_reg(0x1)));
1172 }
1173 break;
1174 case ir_binop_any_nequal:
1175 /* "!=" operator producing a scalar boolean. */
1176 if (ir->operands[0]->type->is_vector() ||
1177 ir->operands[1]->type->is_vector()) {
1178 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1179
1180 emit(MOV(result_dst, src_reg(0)));
1181 inst = emit(MOV(result_dst, src_reg(1)));
1182 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1183 } else {
1184 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1185 emit(AND(result_dst, result_src, src_reg(0x1)));
1186 }
1187 break;
1188
1189 case ir_unop_any:
1190 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1191 emit(MOV(result_dst, src_reg(0)));
1192
1193 inst = emit(MOV(result_dst, src_reg(1)));
1194 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1195 break;
1196
1197 case ir_binop_logic_xor:
1198 emit(XOR(result_dst, op[0], op[1]));
1199 break;
1200
1201 case ir_binop_logic_or:
1202 emit(OR(result_dst, op[0], op[1]));
1203 break;
1204
1205 case ir_binop_logic_and:
1206 emit(AND(result_dst, op[0], op[1]));
1207 break;
1208
1209 case ir_binop_dot:
1210 assert(ir->operands[0]->type->is_vector());
1211 assert(ir->operands[0]->type == ir->operands[1]->type);
1212 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1213 break;
1214
1215 case ir_unop_sqrt:
1216 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1217 break;
1218 case ir_unop_rsq:
1219 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1220 break;
1221
1222 case ir_unop_bitcast_i2f:
1223 case ir_unop_bitcast_u2f:
1224 this->result = op[0];
1225 this->result.type = BRW_REGISTER_TYPE_F;
1226 break;
1227
1228 case ir_unop_bitcast_f2i:
1229 this->result = op[0];
1230 this->result.type = BRW_REGISTER_TYPE_D;
1231 break;
1232
1233 case ir_unop_bitcast_f2u:
1234 this->result = op[0];
1235 this->result.type = BRW_REGISTER_TYPE_UD;
1236 break;
1237
1238 case ir_unop_i2f:
1239 case ir_unop_i2u:
1240 case ir_unop_u2i:
1241 case ir_unop_u2f:
1242 case ir_unop_b2f:
1243 case ir_unop_b2i:
1244 case ir_unop_f2i:
1245 case ir_unop_f2u:
1246 emit(MOV(result_dst, op[0]));
1247 break;
1248 case ir_unop_f2b:
1249 case ir_unop_i2b: {
1250 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1251 emit(AND(result_dst, result_src, src_reg(1)));
1252 break;
1253 }
1254
1255 case ir_unop_trunc:
1256 emit(RNDZ(result_dst, op[0]));
1257 break;
1258 case ir_unop_ceil:
1259 op[0].negate = !op[0].negate;
1260 inst = emit(RNDD(result_dst, op[0]));
1261 this->result.negate = true;
1262 break;
1263 case ir_unop_floor:
1264 inst = emit(RNDD(result_dst, op[0]));
1265 break;
1266 case ir_unop_fract:
1267 inst = emit(FRC(result_dst, op[0]));
1268 break;
1269 case ir_unop_round_even:
1270 emit(RNDE(result_dst, op[0]));
1271 break;
1272
1273 case ir_binop_min:
1274 if (intel->gen >= 6) {
1275 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1276 inst->conditional_mod = BRW_CONDITIONAL_L;
1277 } else {
1278 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1279
1280 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1281 inst->predicate = BRW_PREDICATE_NORMAL;
1282 }
1283 break;
1284 case ir_binop_max:
1285 if (intel->gen >= 6) {
1286 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1287 inst->conditional_mod = BRW_CONDITIONAL_G;
1288 } else {
1289 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1290
1291 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1292 inst->predicate = BRW_PREDICATE_NORMAL;
1293 }
1294 break;
1295
1296 case ir_binop_pow:
1297 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1298 break;
1299
1300 case ir_unop_bit_not:
1301 inst = emit(NOT(result_dst, op[0]));
1302 break;
1303 case ir_binop_bit_and:
1304 inst = emit(AND(result_dst, op[0], op[1]));
1305 break;
1306 case ir_binop_bit_xor:
1307 inst = emit(XOR(result_dst, op[0], op[1]));
1308 break;
1309 case ir_binop_bit_or:
1310 inst = emit(OR(result_dst, op[0], op[1]));
1311 break;
1312
1313 case ir_binop_lshift:
1314 inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1315 break;
1316
1317 case ir_binop_rshift:
1318 if (ir->type->base_type == GLSL_TYPE_INT)
1319 inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1320 else
1321 inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1322 break;
1323
1324 case ir_binop_ubo_load: {
1325 ir_constant *uniform_block = ir->operands[0]->as_constant();
1326 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1327 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1328 src_reg offset = op[1];
1329
1330 /* Now, load the vector from that offset. */
1331 assert(ir->type->is_vector() || ir->type->is_scalar());
1332
1333 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1334 packed_consts.type = result.type;
1335 src_reg surf_index =
1336 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1337 if (const_offset_ir) {
1338 offset = src_reg(const_offset / 16);
1339 } else {
1340 emit(BRW_OPCODE_SHR, dst_reg(offset), offset, src_reg(4));
1341 }
1342
1343 vec4_instruction *pull =
1344 emit(new(mem_ctx) vec4_instruction(this,
1345 VS_OPCODE_PULL_CONSTANT_LOAD,
1346 dst_reg(packed_consts),
1347 surf_index,
1348 offset));
1349 pull->base_mrf = 14;
1350 pull->mlen = 1;
1351
1352 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1353 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1354 const_offset % 16 / 4,
1355 const_offset % 16 / 4,
1356 const_offset % 16 / 4);
1357
1358 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1359 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1360 emit(CMP(result_dst, packed_consts, src_reg(0u),
1361 BRW_CONDITIONAL_NZ));
1362 emit(AND(result_dst, result, src_reg(0x1)));
1363 } else {
1364 emit(MOV(result_dst, packed_consts));
1365 }
1366 break;
1367 }
1368
1369 case ir_quadop_vector:
1370 assert(!"not reached: should be handled by lower_quadop_vector");
1371 break;
1372 }
1373 }
1374
1375
1376 void
1377 vec4_visitor::visit(ir_swizzle *ir)
1378 {
1379 src_reg src;
1380 int i = 0;
1381 int swizzle[4];
1382
1383 /* Note that this is only swizzles in expressions, not those on the left
1384 * hand side of an assignment, which do write masking. See ir_assignment
1385 * for that.
1386 */
1387
1388 ir->val->accept(this);
1389 src = this->result;
1390 assert(src.file != BAD_FILE);
1391
1392 for (i = 0; i < ir->type->vector_elements; i++) {
1393 switch (i) {
1394 case 0:
1395 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1396 break;
1397 case 1:
1398 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1399 break;
1400 case 2:
1401 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1402 break;
1403 case 3:
1404 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1405 break;
1406 }
1407 }
1408 for (; i < 4; i++) {
1409 /* Replicate the last channel out. */
1410 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1411 }
1412
1413 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1414
1415 this->result = src;
1416 }
1417
1418 void
1419 vec4_visitor::visit(ir_dereference_variable *ir)
1420 {
1421 const struct glsl_type *type = ir->type;
1422 dst_reg *reg = variable_storage(ir->var);
1423
1424 if (!reg) {
1425 fail("Failed to find variable storage for %s\n", ir->var->name);
1426 this->result = src_reg(brw_null_reg());
1427 return;
1428 }
1429
1430 this->result = src_reg(*reg);
1431
1432 /* System values get their swizzle from the dst_reg writemask */
1433 if (ir->var->mode == ir_var_system_value)
1434 return;
1435
1436 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1437 this->result.swizzle = swizzle_for_size(type->vector_elements);
1438 }
1439
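/* Constant array indices simply adjust reg_offset; variable indices are
 * handled by (possibly chained) relative addressing through src.reladdr.
 */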
1440 void
1441 vec4_visitor::visit(ir_dereference_array *ir)
1442 {
1443 ir_constant *constant_index;
1444 src_reg src;
1445 int element_size = type_size(ir->type);
1446
1447 constant_index = ir->array_index->constant_expression_value();
1448
1449 ir->array->accept(this);
1450 src = this->result;
1451
1452 if (constant_index) {
1453 src.reg_offset += constant_index->value.i[0] * element_size;
1454 } else {
1455 /* Variable index array dereference. It eats the "vec4" of the
1456 * base of the array and an index that offsets the Mesa register
1457 * index.
1458 */
1459 ir->array_index->accept(this);
1460
1461 src_reg index_reg;
1462
1463 if (element_size == 1) {
1464 index_reg = this->result;
1465 } else {
1466 index_reg = src_reg(this, glsl_type::int_type);
1467
1468 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1469 }
1470
1471 if (src.reladdr) {
1472 src_reg temp = src_reg(this, glsl_type::int_type);
1473
1474 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1475
1476 index_reg = temp;
1477 }
1478
1479 src.reladdr = ralloc(mem_ctx, src_reg);
1480 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1481 }
1482
1483 /* If the type is smaller than a vec4, replicate the last channel out. */
1484 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1485 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1486 else
1487 src.swizzle = BRW_SWIZZLE_NOOP;
1488 src.type = brw_type_for_base_type(ir->type);
1489
1490 this->result = src;
1491 }
1492
1493 void
1494 vec4_visitor::visit(ir_dereference_record *ir)
1495 {
1496 unsigned int i;
1497 const glsl_type *struct_type = ir->record->type;
1498 int offset = 0;
1499
1500 ir->record->accept(this);
1501
1502 for (i = 0; i < struct_type->length; i++) {
1503 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1504 break;
1505 offset += type_size(struct_type->fields.structure[i].type);
1506 }
1507
1508 /* If the type is smaller than a vec4, replicate the last channel out. */
1509 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1510 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1511 else
1512 this->result.swizzle = BRW_SWIZZLE_NOOP;
1513 this->result.type = brw_type_for_base_type(ir->type);
1514
1515 this->result.reg_offset += offset;
1516 }
1517
1518 /**
1519 * We want to be careful in assignment setup to hit the actual storage
1520 * instead of potentially using a temporary like we might with the
1521 * ir_dereference handler.
1522 */
1523 static dst_reg
1524 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1525 {
1526 /* The LHS must be a dereference. If the LHS is a variable indexed array
1527 * access of a vector, it must be separated into a series conditional moves
1528 * before reaching this point (see ir_vec_index_to_cond_assign).
1529 */
1530 assert(ir->as_dereference());
1531 ir_dereference_array *deref_array = ir->as_dereference_array();
1532 if (deref_array) {
1533 assert(!deref_array->array->type->is_vector());
1534 }
1535
1536 /* Use the rvalue deref handler for the most part. We'll ignore
1537 * swizzles in it and write swizzles using writemask, though.
1538 */
1539 ir->accept(v);
1540 return dst_reg(v->result);
1541 }
1542
1543 void
1544 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1545 const struct glsl_type *type, uint32_t predicate)
1546 {
1547 if (type->base_type == GLSL_TYPE_STRUCT) {
1548 for (unsigned int i = 0; i < type->length; i++) {
1549 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1550 }
1551 return;
1552 }
1553
1554 if (type->is_array()) {
1555 for (unsigned int i = 0; i < type->length; i++) {
1556 emit_block_move(dst, src, type->fields.array, predicate);
1557 }
1558 return;
1559 }
1560
1561 if (type->is_matrix()) {
1562 const struct glsl_type *vec_type;
1563
1564 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1565 type->vector_elements, 1);
1566
1567 for (int i = 0; i < type->matrix_columns; i++) {
1568 emit_block_move(dst, src, vec_type, predicate);
1569 }
1570 return;
1571 }
1572
1573 assert(type->is_scalar() || type->is_vector());
1574
1575 dst->type = brw_type_for_base_type(type);
1576 src->type = dst->type;
1577
1578 dst->writemask = (1 << type->vector_elements) - 1;
1579
1580 src->swizzle = swizzle_for_size(type->vector_elements);
1581
1582 vec4_instruction *inst = emit(MOV(*dst, *src));
1583 inst->predicate = predicate;
1584
1585 dst->reg_offset++;
1586 src->reg_offset++;
1587 }
1588
1589
1590 /* If the RHS processing resulted in an instruction generating a
1591 * temporary value, and it would be easy to rewrite the instruction to
1592 * generate its result right into the LHS instead, do so. This ends
1593 * up reliably removing instructions where it can be tricky to do so
1594 * later without real UD chain information.
1595 */
1596 bool
1597 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1598 dst_reg dst,
1599 src_reg src,
1600 vec4_instruction *pre_rhs_inst,
1601 vec4_instruction *last_rhs_inst)
1602 {
1603 /* This could be supported, but it would take more smarts. */
1604 if (ir->condition)
1605 return false;
1606
1607 if (pre_rhs_inst == last_rhs_inst)
1608 return false; /* No instructions generated to work with. */
1609
1610 /* Make sure the last instruction generated our source reg. */
1611 if (src.file != GRF ||
1612 src.file != last_rhs_inst->dst.file ||
1613 src.reg != last_rhs_inst->dst.reg ||
1614 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1615 src.reladdr ||
1616 src.abs ||
1617 src.negate ||
1618 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1619 return false;
1620
1621 /* Check that that last instruction fully initialized the channels
1622 * we want to use, in the order we want to use them. We could
1623 * potentially reswizzle the operands of many instructions so that
1624 * we could handle out of order channels, but don't yet.
1625 */
1626
1627 for (unsigned i = 0; i < 4; i++) {
1628 if (dst.writemask & (1 << i)) {
1629 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1630 return false;
1631
1632 if (BRW_GET_SWZ(src.swizzle, i) != i)
1633 return false;
1634 }
1635 }
1636
1637 /* Success! Rewrite the instruction. */
1638 last_rhs_inst->dst.file = dst.file;
1639 last_rhs_inst->dst.reg = dst.reg;
1640 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1641 last_rhs_inst->dst.reladdr = dst.reladdr;
1642 last_rhs_inst->dst.writemask &= dst.writemask;
1643
1644 return true;
1645 }
1646
1647 void
1648 vec4_visitor::visit(ir_assignment *ir)
1649 {
1650 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1651 uint32_t predicate = BRW_PREDICATE_NONE;
1652
1653 if (!ir->lhs->type->is_scalar() &&
1654 !ir->lhs->type->is_vector()) {
1655 ir->rhs->accept(this);
1656 src_reg src = this->result;
1657
1658 if (ir->condition) {
1659 emit_bool_to_cond_code(ir->condition, &predicate);
1660 }
1661
1662 /* emit_block_move doesn't account for swizzles in the source register.
1663 * This should be ok, since the source register is a structure or an
1664 * array, and those can't be swizzled. But double-check to be sure.
1665 */
1666 assert(src.swizzle ==
1667 (ir->rhs->type->is_matrix()
1668 ? swizzle_for_size(ir->rhs->type->vector_elements)
1669 : BRW_SWIZZLE_NOOP));
1670
1671 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1672 return;
1673 }
1674
1675 /* Now we're down to just a scalar/vector with writemasks. */
1676 int i;
1677
1678 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1679 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1680
1681 ir->rhs->accept(this);
1682
1683 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1684
1685 src_reg src = this->result;
1686
1687 int swizzles[4];
1688 int first_enabled_chan = 0;
1689 int src_chan = 0;
1690
1691 assert(ir->lhs->type->is_vector() ||
1692 ir->lhs->type->is_scalar());
1693 dst.writemask = ir->write_mask;
1694
1695 for (int i = 0; i < 4; i++) {
1696 if (dst.writemask & (1 << i)) {
1697 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1698 break;
1699 }
1700 }
1701
1702 /* Swizzle a small RHS vector into the channels being written.
1703 *
1704 * glsl ir treats write_mask as dictating how many channels are
1705 * present on the RHS while in our instructions we need to make
1706 * those channels appear in the slots of the vec4 they're written to.
1707 */
1708 for (int i = 0; i < 4; i++) {
1709 if (dst.writemask & (1 << i))
1710 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1711 else
1712 swizzles[i] = first_enabled_chan;
1713 }
1714 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1715 swizzles[2], swizzles[3]);
1716
1717 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1718 return;
1719 }
1720
1721 if (ir->condition) {
1722 emit_bool_to_cond_code(ir->condition, &predicate);
1723 }
1724
1725 for (i = 0; i < type_size(ir->lhs->type); i++) {
1726 vec4_instruction *inst = emit(MOV(dst, src));
1727 inst->predicate = predicate;
1728
1729 dst.reg_offset++;
1730 src.reg_offset++;
1731 }
1732 }
1733
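/* Emit MOVs of immediate values into *dst, walking structs, arrays and
 * matrix columns, and merging identical components into a single
 * writemask where possible.
 */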
1734 void
1735 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1736 {
1737 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1738 foreach_list(node, &ir->components) {
1739 ir_constant *field_value = (ir_constant *)node;
1740
1741 emit_constant_values(dst, field_value);
1742 }
1743 return;
1744 }
1745
1746 if (ir->type->is_array()) {
1747 for (unsigned int i = 0; i < ir->type->length; i++) {
1748 emit_constant_values(dst, ir->array_elements[i]);
1749 }
1750 return;
1751 }
1752
1753 if (ir->type->is_matrix()) {
1754 for (int i = 0; i < ir->type->matrix_columns; i++) {
1755 float *vec = &ir->value.f[i * ir->type->vector_elements];
1756
1757 for (int j = 0; j < ir->type->vector_elements; j++) {
1758 dst->writemask = 1 << j;
1759 dst->type = BRW_REGISTER_TYPE_F;
1760
1761 emit(MOV(*dst, src_reg(vec[j])));
1762 }
1763 dst->reg_offset++;
1764 }
1765 return;
1766 }
1767
1768 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1769
1770 for (int i = 0; i < ir->type->vector_elements; i++) {
1771 if (!(remaining_writemask & (1 << i)))
1772 continue;
1773
1774 dst->writemask = 1 << i;
1775 dst->type = brw_type_for_base_type(ir->type);
1776
1777 /* Find other components that match the one we're about to
1778 * write. Emits fewer instructions for things like vec4(0.5,
1779 * 1.5, 1.5, 1.5).
1780 */
1781 for (int j = i + 1; j < ir->type->vector_elements; j++) {
1782 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1783 if (ir->value.b[i] == ir->value.b[j])
1784 dst->writemask |= (1 << j);
1785 } else {
1786 /* u, i, and f storage all line up, so no need for a
1787 * switch case for comparing each type.
1788 */
1789 if (ir->value.u[i] == ir->value.u[j])
1790 dst->writemask |= (1 << j);
1791 }
1792 }
1793
1794 switch (ir->type->base_type) {
1795 case GLSL_TYPE_FLOAT:
1796 emit(MOV(*dst, src_reg(ir->value.f[i])));
1797 break;
1798 case GLSL_TYPE_INT:
1799 emit(MOV(*dst, src_reg(ir->value.i[i])));
1800 break;
1801 case GLSL_TYPE_UINT:
1802 emit(MOV(*dst, src_reg(ir->value.u[i])));
1803 break;
1804 case GLSL_TYPE_BOOL:
1805 emit(MOV(*dst, src_reg(ir->value.b[i])));
1806 break;
1807 default:
1808 assert(!"Non-float/uint/int/bool constant");
1809 break;
1810 }
1811
1812 remaining_writemask &= ~dst->writemask;
1813 }
1814 dst->reg_offset++;
1815 }
1816
1817 void
1818 vec4_visitor::visit(ir_constant *ir)
1819 {
1820 dst_reg dst = dst_reg(this, ir->type);
1821 this->result = src_reg(dst);
1822
1823 emit_constant_values(&dst, ir);
1824 }
1825
1826 void
1827 vec4_visitor::visit(ir_call *ir)
1828 {
1829 assert(!"not reached");
1830 }
1831
1832 void
1833 vec4_visitor::visit(ir_texture *ir)
1834 {
1835 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1836
1837 /* Should be lowered by do_lower_texture_projection */
1838 assert(!ir->projector);
1839
1840 /* Generate code to compute all the subexpression trees. This has to be
1841 * done before loading any values into MRFs for the sampler message since
1842 * generating these values may involve SEND messages that need the MRFs.
1843 */
1844 src_reg coordinate;
1845 if (ir->coordinate) {
1846 ir->coordinate->accept(this);
1847 coordinate = this->result;
1848 }
1849
1850 src_reg shadow_comparitor;
1851 if (ir->shadow_comparitor) {
1852 ir->shadow_comparitor->accept(this);
1853 shadow_comparitor = this->result;
1854 }
1855
1856 const glsl_type *lod_type;
1857 src_reg lod, dPdx, dPdy;
1858 switch (ir->op) {
1859 case ir_tex:
1860 lod = src_reg(0.0f);
1861 lod_type = glsl_type::float_type;
1862 break;
1863 case ir_txf:
1864 case ir_txl:
1865 case ir_txs:
1866 ir->lod_info.lod->accept(this);
1867 lod = this->result;
1868 lod_type = ir->lod_info.lod->type;
1869 break;
1870 case ir_txd:
1871 ir->lod_info.grad.dPdx->accept(this);
1872 dPdx = this->result;
1873
1874 ir->lod_info.grad.dPdy->accept(this);
1875 dPdy = this->result;
1876
1877 lod_type = ir->lod_info.grad.dPdx->type;
1878 break;
1879 case ir_txb:
1880 break;
1881 }
1882
1883 vec4_instruction *inst = NULL;
1884 switch (ir->op) {
1885 case ir_tex:
1886 case ir_txl:
1887 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1888 break;
1889 case ir_txd:
1890 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1891 break;
1892 case ir_txf:
1893 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1894 break;
1895 case ir_txs:
1896 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1897 break;
1898 case ir_txb:
1899 assert(!"TXB is not valid for vertex shaders.");
1900 }
1901
1902 /* Texel offsets go in the message header; Gen4 also requires headers. */
1903 inst->header_present = ir->offset || intel->gen < 5;
1904 inst->base_mrf = 2;
1905 inst->mlen = inst->header_present + 1; /* always at least one */
1906 inst->sampler = sampler;
1907 inst->dst = dst_reg(this, ir->type);
1908 inst->dst.writemask = WRITEMASK_XYZW;
1909 inst->shadow_compare = ir->shadow_comparitor != NULL;
1910
1911 if (ir->offset != NULL && ir->op != ir_txf)
1912 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1913
1914 /* MRF for the first parameter */
1915 int param_base = inst->base_mrf + inst->header_present;
1916
1917 if (ir->op == ir_txs) {
1918 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
1919 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
1920 } else {
1921 int i, coord_mask = 0, zero_mask = 0;
1922 /* Load the coordinate */
1923 /* FINISHME: gl_clamp_mask and saturate */
1924 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
1925 coord_mask |= (1 << i);
1926 for (; i < 4; i++)
1927 zero_mask |= (1 << i);
1928
1929 if (ir->offset && ir->op == ir_txf) {
1930 /* It appears that the ld instruction used for txf does its
1931 * address bounds check before adding in the offset. To work
1932 * around this, just add the integer offset to the integer
1933 * texel coordinate, and don't put the offset in the header.
1934 */
1935 ir_constant *offset = ir->offset->as_constant();
1936 assert(offset);
1937
1938 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
1939 src_reg src = coordinate;
1940 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
1941 BRW_GET_SWZ(src.swizzle, j),
1942 BRW_GET_SWZ(src.swizzle, j),
1943 BRW_GET_SWZ(src.swizzle, j));
1944 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
1945 src, offset->value.i[j]));
1946 }
1947 } else {
1948 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
1949 coordinate));
1950 }
1951 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
1952 src_reg(0)));
1953 /* Load the shadow comparator */
1954 if (ir->shadow_comparitor) {
1955 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
1956 WRITEMASK_X),
1957 shadow_comparitor));
1958 inst->mlen++;
1959 }
1960
1961 /* Load the LOD info */
1962 if (ir->op == ir_tex || ir->op == ir_txl) {
1963 int mrf, writemask;
1964 if (intel->gen >= 5) {
1965 mrf = param_base + 1;
1966 if (ir->shadow_comparitor) {
1967 writemask = WRITEMASK_Y;
1968 /* mlen already incremented */
1969 } else {
1970 writemask = WRITEMASK_X;
1971 inst->mlen++;
1972 }
1973 } else /* intel->gen == 4 */ {
1974 mrf = param_base;
1975 writemask = WRITEMASK_Z;
1976 }
1977 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
1978 } else if (ir->op == ir_txf) {
1979 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W),
1980 lod));
1981 } else if (ir->op == ir_txd) {
1982 const glsl_type *type = lod_type;
1983
1984 if (intel->gen >= 5) {
1985 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1986 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1987 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
1988 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
1989 inst->mlen++;
1990
1991 if (ir->type->vector_elements == 3) {
1992 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
1993 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
1994 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
1995 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
1996 inst->mlen++;
1997 }
1998 } else /* intel->gen == 4 */ {
1999 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2000 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2001 inst->mlen += 2;
2002 }
2003 }
2004 }
2005
2006 emit(inst);
2007
2008 swizzle_result(ir, src_reg(inst->dst), sampler);
2009 }
2010
2011 void
2012 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2013 {
2014 int s = c->key.tex.swizzles[sampler];
2015
2016 this->result = src_reg(this, ir->type);
2017 dst_reg swizzled_result(this->result);
2018
2019 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2020 || s == SWIZZLE_NOOP) {
2021 emit(MOV(swizzled_result, orig_val));
2022 return;
2023 }
2024
2025 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2026 int swizzle[4];
2027
2028 for (int i = 0; i < 4; i++) {
2029 switch (GET_SWZ(s, i)) {
2030 case SWIZZLE_ZERO:
2031 zero_mask |= (1 << i);
2032 break;
2033 case SWIZZLE_ONE:
2034 one_mask |= (1 << i);
2035 break;
2036 default:
2037 copy_mask |= (1 << i);
2038 swizzle[i] = GET_SWZ(s, i);
2039 break;
2040 }
2041 }
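   /* Worked example (illustrative): a depth-texture style swizzle of
    * (R, R, R, ONE) produces copy_mask = XYZ with swizzle[0..2] = X,
    * one_mask = W and zero_mask = 0, so only the copy MOV and the 1.0f MOV
    * below get emitted.
    */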
2042
2043 if (copy_mask) {
2044 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2045 swizzled_result.writemask = copy_mask;
2046 emit(MOV(swizzled_result, orig_val));
2047 }
2048
2049 if (zero_mask) {
2050 swizzled_result.writemask = zero_mask;
2051 emit(MOV(swizzled_result, src_reg(0.0f)));
2052 }
2053
2054 if (one_mask) {
2055 swizzled_result.writemask = one_mask;
2056 emit(MOV(swizzled_result, src_reg(1.0f)));
2057 }
2058 }
2059
2060 void
2061 vec4_visitor::visit(ir_return *ir)
2062 {
2063 assert(!"not reached");
2064 }
2065
2066 void
2067 vec4_visitor::visit(ir_discard *ir)
2068 {
2069 assert(!"not reached");
2070 }
2071
2072 void
2073 vec4_visitor::visit(ir_if *ir)
2074 {
2075 /* Don't point the annotation at the if statement, because then it plus
2076 * the then and else blocks get printed.
2077 */
2078 this->base_ir = ir->condition;
2079
2080 if (intel->gen == 6) {
2081 emit_if_gen6(ir);
2082 } else {
2083 uint32_t predicate;
2084 emit_bool_to_cond_code(ir->condition, &predicate);
2085 emit(IF(predicate));
2086 }
2087
2088 visit_instructions(&ir->then_instructions);
2089
2090 if (!ir->else_instructions.is_empty()) {
2091 this->base_ir = ir->condition;
2092 emit(BRW_OPCODE_ELSE);
2093
2094 visit_instructions(&ir->else_instructions);
2095 }
2096
2097 this->base_ir = ir->condition;
2098 emit(BRW_OPCODE_ENDIF);
2099 }
2100
2101 void
2102 vec4_visitor::emit_ndc_computation()
2103 {
2104 /* Get the position */
2105 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2106
2107 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2108 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2109 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2110
2111 current_annotation = "NDC";
2112 dst_reg ndc_w = ndc;
2113 ndc_w.writemask = WRITEMASK_W;
2114 src_reg pos_w = pos;
2115 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2116 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2117
2118 dst_reg ndc_xyz = ndc;
2119 ndc_xyz.writemask = WRITEMASK_XYZ;
2120
2121 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2122 }
2123
2124 void
2125 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2126 {
2127 if (intel->gen < 6 &&
2128 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2129 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2130 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2131 dst_reg header1_w = header1;
2132 header1_w.writemask = WRITEMASK_W;
2133 GLuint i;
2134
2135 emit(MOV(header1, 0u));
2136
2137 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2138 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2139
2140 current_annotation = "Point size";
2141 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2142 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2143 }
2144
2145 current_annotation = "Clipping flags";
2146 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2147 vec4_instruction *inst;
2148
2149 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2150 src_reg(this->userplane[i])));
2151 inst->conditional_mod = BRW_CONDITIONAL_L;
2152
2153 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2154 inst->predicate = BRW_PREDICATE_NORMAL;
2155 }
2156
2157 /* i965 clipping workaround:
2158 * 1) Test for -ve rhw
2159 * 2) If set,
2160 * set ndc = (0,0,0,0)
2161 * set ucp[6] = 1
2162 *
2163 * Later, clipping will detect ucp[6] and ensure the primitive is
2164 * clipped against all fixed planes.
2165 */
2166 if (brw->has_negative_rhw_bug) {
2167 #if 0
2168 /* FINISHME */
2169 brw_CMP(p,
2170 vec8(brw_null_reg()),
2171 BRW_CONDITIONAL_L,
2172 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2173 brw_imm_f(0));
2174
2175 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2176 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2177 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2178 #endif
2179 }
2180
2181 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2182 } else if (intel->gen < 6) {
2183 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2184 } else {
2185 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2186 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2187 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2188 src_reg(output_reg[VERT_RESULT_PSIZ])));
2189 }
2190 }
2191 }
2192
2193 void
2194 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2195 {
2196 if (intel->gen < 6) {
2197 /* Clip distance slots are set aside in gen5, but they are not used. It
2198 * is not clear whether we actually need to set aside space for them,
2199 * but the performance cost is negligible.
2200 */
2201 return;
2202 }
2203
2204 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2205 *
2206 * "If a linked set of shaders forming the vertex stage contains no
2207 * static write to gl_ClipVertex or gl_ClipDistance, but the
2208 * application has requested clipping against user clip planes through
2209 * the API, then the coordinate written to gl_Position is used for
2210 * comparison against the user clip planes."
2211 *
2212 * This function is only called if the shader didn't write to
2213 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2214 * if the user wrote to it; otherwise we use gl_Position.
2215 */
2216 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2217 if (!(c->prog_data.outputs_written
2218 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2219 clip_vertex = VERT_RESULT_HPOS;
2220 }
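   /* For example (illustrative), with 6 user clip planes enabled this
    * function runs twice: offset 0 for VERT_RESULT_CLIP_DIST0 covers planes
    * 0-3 and offset 4 for VERT_RESULT_CLIP_DIST1 covers planes 4-5, each DP4
    * writing one component of the clip-distance vec4.
    */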
2221
2222 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2223 ++i) {
2224 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2225 src_reg(output_reg[clip_vertex]),
2226 src_reg(this->userplane[i + offset])));
2227 }
2228 }
2229
2230 void
2231 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2232 {
2233 assert (vert_result < VERT_RESULT_MAX);
2234 reg.type = output_reg[vert_result].type;
2235 current_annotation = output_reg_annotation[vert_result];
2236 /* Copy the register, saturating if necessary */
2237 vec4_instruction *inst = emit(MOV(reg,
2238 src_reg(output_reg[vert_result])));
2239 if ((vert_result == VERT_RESULT_COL0 ||
2240 vert_result == VERT_RESULT_COL1 ||
2241 vert_result == VERT_RESULT_BFC0 ||
2242 vert_result == VERT_RESULT_BFC1) &&
2243 c->key.clamp_vertex_color) {
2244 inst->saturate = true;
2245 }
2246 }
2247
2248 void
2249 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2250 {
2251 struct brw_reg hw_reg = brw_message_reg(mrf);
2252 dst_reg reg = dst_reg(MRF, mrf);
2253 reg.type = BRW_REGISTER_TYPE_F;
2254
2255 switch (vert_result) {
2256 case VERT_RESULT_PSIZ:
2257 /* PSIZ is always in slot 0, and is coupled with other flags. */
2258 current_annotation = "indices, point width, clip flags";
2259 emit_psiz_and_flags(hw_reg);
2260 break;
2261 case BRW_VERT_RESULT_NDC:
2262 current_annotation = "NDC";
2263 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2264 break;
2265 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2266 case VERT_RESULT_HPOS:
2267 current_annotation = "gl_Position";
2268 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2269 break;
2270 case VERT_RESULT_CLIP_DIST0:
2271 case VERT_RESULT_CLIP_DIST1:
2272 if (this->c->key.uses_clip_distance) {
2273 emit_generic_urb_slot(reg, vert_result);
2274 } else {
2275 current_annotation = "user clip distances";
2276 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2277 }
2278 break;
2279 case VERT_RESULT_EDGE:
2280 /* This is present when doing unfilled polygons. We're supposed to copy
2281 * the edge flag from the user-provided vertex array
2282 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2283 * of that attribute (starts as 1.0f). This is then used in clipping to
2284 * determine which edges should be drawn as wireframe.
2285 */
2286 current_annotation = "edge flag";
2287 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2288 glsl_type::float_type, WRITEMASK_XYZW))));
2289 break;
2290 case BRW_VERT_RESULT_PAD:
2291 /* No need to write to this slot */
2292 break;
2293 default:
2294 emit_generic_urb_slot(reg, vert_result);
2295 break;
2296 }
2297 }
2298
2299 static int
2300 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2301 {
2302 struct intel_context *intel = &brw->intel;
2303
2304 if (intel->gen >= 6) {
2305 /* URB data written (does not include the message header reg) must
2306 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2307 * section 5.4.3.2.2: URB_INTERLEAVED.
2308 *
2309 * URB entries are allocated on a multiple of 1024 bits, so an
2310 * extra 128 bits written here to make the end align to 256 is
2311 * no problem.
2312 */
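      /* Example (illustrative): an mlen of 4 means one header reg plus three
       * data regs; since 4 % 2 != 1 it is bumped to 5, making the data
       * portion four regs (a multiple of 2).  An odd mlen is already aligned
       * and left alone.
       */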
2313 if ((mlen % 2) != 1)
2314 mlen++;
2315 }
2316
2317 return mlen;
2318 }
2319
2320 /**
2321 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2322 * complete the VS thread.
2323 *
2324 * The VUE layout is documented in Volume 2a.
2325 */
2326 void
2327 vec4_visitor::emit_urb_writes()
2328 {
2329 /* MRF 0 is reserved for the debugger, so start with message header
2330 * in MRF 1.
2331 */
2332 int base_mrf = 1;
2333 int mrf = base_mrf;
2334 /* In the process of generating our URB write message contents, we
2335 * may need to unspill a register or load from an array. Those
2336 * reads would use MRFs 14-15.
2337 */
2338 int max_usable_mrf = 13;
2339
2340 /* The following assertion verifies that max_usable_mrf causes an
2341 * even-numbered amount of URB write data, which will meet gen6's
2342 * requirements for length alignment.
2343 */
2344 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2345
2346 /* The first MRF is the g0-based message header containing URB handles and such,
2347 * which is implied in VS_OPCODE_URB_WRITE.
2348 */
2349 mrf++;
2350
2351 if (intel->gen < 6) {
2352 emit_ndc_computation();
2353 }
2354
2355 /* Set up the VUE data for the first URB write */
2356 int slot;
2357 for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2358 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2359
2360 /* If this was max_usable_mrf, we can't fit anything more into this URB
2361 * WRITE.
2362 */
2363 if (mrf > max_usable_mrf) {
2364 slot++;
2365 break;
2366 }
2367 }
2368
2369 current_annotation = "URB write";
2370 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2371 inst->base_mrf = base_mrf;
2372 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2373 inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2374
2375 /* Optional second URB write */
2376 if (!inst->eot) {
2377 mrf = base_mrf + 1;
2378
2379 for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2380 assert(mrf < max_usable_mrf);
2381
2382 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2383 }
2384
2385 current_annotation = "URB write";
2386 inst = emit(VS_OPCODE_URB_WRITE);
2387 inst->base_mrf = base_mrf;
2388 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2389 inst->eot = true;
2390 /* URB destination offset. In the previous write, we got MRFs
2391 * 2-13 minus the one header MRF, so 12 regs. URB offset is in
2392 * URB row increments, and each of our MRFs is half of one of
2393 * those, since we're doing interleaved writes.
2394 */
2395 inst->offset = (max_usable_mrf - base_mrf) / 2;
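      /* With base_mrf == 1 and max_usable_mrf == 13 as set up above, this is
       * an offset of 6 URB rows (12 data MRFs, two MRFs per interleaved row).
       */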
2396 }
2397 }
2398
2399 src_reg
2400 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2401 src_reg *reladdr, int reg_offset)
2402 {
2403 /* Because we store the values to scratch interleaved like our
2404 * vertex data, we need to scale the vec4 index by 2.
2405 */
2406 int message_header_scale = 2;
2407
2408 /* Pre-gen6, the message header uses byte offsets instead of vec4
2409 * (16-byte) offset units.
2410 */
2411 if (intel->gen < 6)
2412 message_header_scale *= 16;
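   /* Illustrative: a constant reg_offset of 3 becomes an immediate of 6 on
    * gen6+ (vec4 units, doubled for interleaving) or 96 on older parts
    * (byte units: 3 * 2 * 16).
    */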
2413
2414 if (reladdr) {
2415 src_reg index = src_reg(this, glsl_type::int_type);
2416
2417 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2418 emit_before(inst, MUL(dst_reg(index),
2419 index, src_reg(message_header_scale)));
2420
2421 return index;
2422 } else {
2423 return src_reg(reg_offset * message_header_scale);
2424 }
2425 }
2426
2427 src_reg
2428 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2429 src_reg *reladdr, int reg_offset)
2430 {
2431 if (reladdr) {
2432 src_reg index = src_reg(this, glsl_type::int_type);
2433
2434 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2435
2436 /* Pre-gen6, the message header uses byte offsets instead of vec4
2437 * (16-byte) offset units.
2438 */
2439 if (intel->gen < 6) {
2440 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2441 }
2442
2443 return index;
2444 } else {
2445 int message_header_scale = intel->gen < 6 ? 16 : 1;
2446 return src_reg(reg_offset * message_header_scale);
2447 }
2448 }
2449
2450 /**
2451 * Emits an instruction before @inst to load the value named by @orig_src
2452 * from scratch space at @base_offset to @temp.
2453 *
2454 * @base_offset is measured in 32-byte units (the size of a register).
2455 */
2456 void
2457 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2458 dst_reg temp, src_reg orig_src,
2459 int base_offset)
2460 {
2461 int reg_offset = base_offset + orig_src.reg_offset;
2462 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2463
2464 emit_before(inst, SCRATCH_READ(temp, index));
2465 }
2466
2467 /**
2468 * Emits an instruction after @inst to store the value to be written
2469 * to @orig_dst to scratch space at @base_offset, from @temp.
2470 *
2471 * @base_offset is measured in 32-byte units (the size of a register).
2472 */
2473 void
2474 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2475 src_reg temp, dst_reg orig_dst,
2476 int base_offset)
2477 {
2478 int reg_offset = base_offset + orig_dst.reg_offset;
2479 src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2480
2481 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2482 orig_dst.writemask));
2483 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2484 write->predicate = inst->predicate;
2485 write->ir = inst->ir;
2486 write->annotation = inst->annotation;
2487 inst->insert_after(write);
2488 }
2489
2490 /**
2491 * We can't generally support array access in GRF space, because a
2492 * single instruction's destination can only span 2 contiguous
2493 * registers. So, we send all GRF arrays that get variable index
2494 * access to scratch space.
2495 */
2496 void
2497 vec4_visitor::move_grf_array_access_to_scratch()
2498 {
2499 int scratch_loc[this->virtual_grf_count];
2500
2501 for (int i = 0; i < this->virtual_grf_count; i++) {
2502 scratch_loc[i] = -1;
2503 }
2504
2505 /* First, calculate the set of virtual GRFs that need to be punted
2506 * to scratch due to having any array access on them, and where in
2507 * scratch.
2508 */
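   /* Illustrative example: a local `vec4 tmp[4]` indexed with a loop counter
    * appears here as a GRF with a reladdr; it is given 4 consecutive scratch
    * slots starting at the current c->last_scratch, and every access to it
    * is rewritten below as a scratch read or write through a fresh temporary.
    */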
2509 foreach_list(node, &this->instructions) {
2510 vec4_instruction *inst = (vec4_instruction *)node;
2511
2512 if (inst->dst.file == GRF && inst->dst.reladdr &&
2513 scratch_loc[inst->dst.reg] == -1) {
2514 scratch_loc[inst->dst.reg] = c->last_scratch;
2515 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2516 }
2517
2518 for (int i = 0 ; i < 3; i++) {
2519 src_reg *src = &inst->src[i];
2520
2521 if (src->file == GRF && src->reladdr &&
2522 scratch_loc[src->reg] == -1) {
2523 scratch_loc[src->reg] = c->last_scratch;
2524 c->last_scratch += this->virtual_grf_sizes[src->reg];
2525 }
2526 }
2527 }
2528
2529 /* Now, for anything that will be accessed through scratch, rewrite
2530 * it to load/store. Note that this is a _safe list walk, because
2531 * we may generate a new scratch_write instruction after the one
2532 * we're processing.
2533 */
2534 foreach_list_safe(node, &this->instructions) {
2535 vec4_instruction *inst = (vec4_instruction *)node;
2536
2537 /* Set up the annotation tracking for newly generated instructions. */
2538 base_ir = inst->ir;
2539 current_annotation = inst->annotation;
2540
2541 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2542 src_reg temp = src_reg(this, glsl_type::vec4_type);
2543
2544 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2545
2546 inst->dst.file = temp.file;
2547 inst->dst.reg = temp.reg;
2548 inst->dst.reg_offset = temp.reg_offset;
2549 inst->dst.reladdr = NULL;
2550 }
2551
2552 for (int i = 0 ; i < 3; i++) {
2553 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2554 continue;
2555
2556 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2557
2558 emit_scratch_read(inst, temp, inst->src[i],
2559 scratch_loc[inst->src[i].reg]);
2560
2561 inst->src[i].file = temp.file;
2562 inst->src[i].reg = temp.reg;
2563 inst->src[i].reg_offset = temp.reg_offset;
2564 inst->src[i].reladdr = NULL;
2565 }
2566 }
2567 }
2568
2569 /**
2570 * Emits an instruction before @inst to load the value named by @orig_src
2571 * from the pull constant buffer (surface) at @base_offset to @temp.
2572 */
2573 void
2574 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2575 dst_reg temp, src_reg orig_src,
2576 int base_offset)
2577 {
2578 int reg_offset = base_offset + orig_src.reg_offset;
2579 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2580 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2581 vec4_instruction *load;
2582
2583 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2584 temp, index, offset);
2585 load->base_mrf = 14;
2586 load->mlen = 1;
2587 emit_before(inst, load);
2588 }
2589
2590 /**
2591 * Implements array access of uniforms by inserting a
2592 * PULL_CONSTANT_LOAD instruction.
2593 *
2594 * Unlike temporary GRF array access (where we don't support it due to
2595 * the difficulty of doing relative addressing on instruction
2596 * destinations), we could potentially do array access of uniforms
2597 * that were loaded in GRF space as push constants. In real-world
2598 * usage we've seen, though, the arrays being used are always larger
2599 * than we could load as push constants, so just always move all
2600 * uniform array access out to a pull constant buffer.
2601 */
2602 void
2603 vec4_visitor::move_uniform_array_access_to_pull_constants()
2604 {
2605 int pull_constant_loc[this->uniforms];
2606
2607 for (int i = 0; i < this->uniforms; i++) {
2608 pull_constant_loc[i] = -1;
2609 }
2610
2611 /* Walk through and find array access of uniforms. Put a copy of that
2612 * uniform in the pull constant buffer.
2613 *
2614 * Note that we don't move constant-indexed accesses to arrays. No
2615 * testing has been done of the performance impact of this choice.
2616 */
2617 foreach_list_safe(node, &this->instructions) {
2618 vec4_instruction *inst = (vec4_instruction *)node;
2619
2620 for (int i = 0 ; i < 3; i++) {
2621 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2622 continue;
2623
2624 int uniform = inst->src[i].reg;
2625
2626 /* If this array isn't already present in the pull constant buffer,
2627 * add it.
2628 */
2629 if (pull_constant_loc[uniform] == -1) {
2630 const float **values = &prog_data->param[uniform * 4];
2631
2632 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2633
2634 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2635 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2636 }
2637 }
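         /* Illustrative example: a `uniform vec4 u[8]` accessed with a
          * variable index copies its 8 * 4 per-component parameter pointers
          * into pull_param here and records its vec4-aligned location, so the
          * access below can be replaced with a VS_OPCODE_PULL_CONSTANT_LOAD
          * from that buffer.
          */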
2638
2639 /* Set up the annotation tracking for newly generated instructions. */
2640 base_ir = inst->ir;
2641 current_annotation = inst->annotation;
2642
2643 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2644
2645 emit_pull_constant_load(inst, temp, inst->src[i],
2646 pull_constant_loc[uniform]);
2647
2648 inst->src[i].file = temp.file;
2649 inst->src[i].reg = temp.reg;
2650 inst->src[i].reg_offset = temp.reg_offset;
2651 inst->src[i].reladdr = NULL;
2652 }
2653 }
2654
2655 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2656 * no need to track them as larger-than-vec4 objects. This will be
2657 * relied on in cutting out unused uniform vectors from push
2658 * constants.
2659 */
2660 split_uniform_registers();
2661 }
2662
2663 void
2664 vec4_visitor::resolve_ud_negate(src_reg *reg)
2665 {
2666 if (reg->type != BRW_REGISTER_TYPE_UD ||
2667 !reg->negate)
2668 return;
2669
2670 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2671 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2672 *reg = temp;
2673 }
2674
2675 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2676 struct gl_shader_program *prog,
2677 struct brw_shader *shader)
2678 {
2679 this->c = c;
2680 this->p = &c->func;
2681 this->brw = p->brw;
2682 this->intel = &brw->intel;
2683 this->ctx = &intel->ctx;
2684 this->prog = prog;
2685 this->shader = shader;
2686
2687 this->mem_ctx = ralloc_context(NULL);
2688 this->failed = false;
2689
2690 this->base_ir = NULL;
2691 this->current_annotation = NULL;
2692
2693 this->c = c;
2694 this->vp = (struct gl_vertex_program *)
2695 prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2696 this->prog_data = &c->prog_data;
2697
2698 this->variable_ht = hash_table_ctor(0,
2699 hash_table_pointer_hash,
2700 hash_table_pointer_compare);
2701
2702 this->virtual_grf_def = NULL;
2703 this->virtual_grf_use = NULL;
2704 this->virtual_grf_sizes = NULL;
2705 this->virtual_grf_count = 0;
2706 this->virtual_grf_reg_map = NULL;
2707 this->virtual_grf_reg_count = 0;
2708 this->virtual_grf_array_size = 0;
2709 this->live_intervals_valid = false;
2710
2711 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2712
2713 this->uniforms = 0;
2714 }
2715
2716 vec4_visitor::~vec4_visitor()
2717 {
2718 ralloc_free(this->mem_ctx);
2719 hash_table_dtor(this->variable_ht);
2720 }
2721
2722
2723 void
2724 vec4_visitor::fail(const char *format, ...)
2725 {
2726 va_list va;
2727 char *msg;
2728
2729 if (failed)
2730 return;
2731
2732 failed = true;
2733
2734 va_start(va, format);
2735 msg = ralloc_vasprintf(mem_ctx, format, va);
2736 va_end(va);
2737 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2738
2739 this->fail_msg = msg;
2740
2741 if (INTEL_DEBUG & DEBUG_VS) {
2742 fprintf(stderr, "%s", msg);
2743 }
2744 }
2745
2746 } /* namespace brw */
2747