1 /*
2  * Copyright © 2011 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_vec4.h"
25 #include "brw_fs.h"
26 #include "brw_cfg.h"
27 #include "brw_nir.h"
28 #include "brw_vec4_builder.h"
29 #include "brw_vec4_live_variables.h"
30 #include "brw_vec4_vs.h"
31 #include "brw_dead_control_flow.h"
32 #include "common/gen_debug.h"
33 #include "program/prog_parameter.h"
34 
35 #define MAX_INSTRUCTION (1 << 30)
36 
37 using namespace brw;
38 
39 namespace brw {
40 
41 void
init()42 src_reg::init()
43 {
44    memset(this, 0, sizeof(*this));
45    this->file = BAD_FILE;
46    this->type = BRW_REGISTER_TYPE_UD;
47 }
48 
src_reg(enum brw_reg_file file,int nr,const glsl_type * type)49 src_reg::src_reg(enum brw_reg_file file, int nr, const glsl_type *type)
50 {
51    init();
52 
53    this->file = file;
54    this->nr = nr;
55    if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
56       this->swizzle = brw_swizzle_for_size(type->vector_elements);
57    else
58       this->swizzle = BRW_SWIZZLE_XYZW;
59    if (type)
60       this->type = brw_type_for_base_type(type);
61 }
62 
63 /** Generic unset register constructor. */
src_reg()64 src_reg::src_reg()
65 {
66    init();
67 }
68 
src_reg(struct::brw_reg reg)69 src_reg::src_reg(struct ::brw_reg reg) :
70    backend_reg(reg)
71 {
72    this->offset = 0;
73    this->reladdr = NULL;
74 }
75 
src_reg(const dst_reg & reg)76 src_reg::src_reg(const dst_reg &reg) :
77    backend_reg(reg)
78 {
79    this->reladdr = reg.reladdr;
80    this->swizzle = brw_swizzle_for_mask(reg.writemask);
81 }
82 
83 void
init()84 dst_reg::init()
85 {
86    memset(this, 0, sizeof(*this));
87    this->file = BAD_FILE;
88    this->type = BRW_REGISTER_TYPE_UD;
89    this->writemask = WRITEMASK_XYZW;
90 }
91 
dst_reg()92 dst_reg::dst_reg()
93 {
94    init();
95 }
96 
dst_reg(enum brw_reg_file file,int nr)97 dst_reg::dst_reg(enum brw_reg_file file, int nr)
98 {
99    init();
100 
101    this->file = file;
102    this->nr = nr;
103 }
104 
dst_reg(enum brw_reg_file file,int nr,const glsl_type * type,unsigned writemask)105 dst_reg::dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
106                  unsigned writemask)
107 {
108    init();
109 
110    this->file = file;
111    this->nr = nr;
112    this->type = brw_type_for_base_type(type);
113    this->writemask = writemask;
114 }
115 
dst_reg(enum brw_reg_file file,int nr,brw_reg_type type,unsigned writemask)116 dst_reg::dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
117                  unsigned writemask)
118 {
119    init();
120 
121    this->file = file;
122    this->nr = nr;
123    this->type = type;
124    this->writemask = writemask;
125 }
126 
dst_reg(struct::brw_reg reg)127 dst_reg::dst_reg(struct ::brw_reg reg) :
128    backend_reg(reg)
129 {
130    this->offset = 0;
131    this->reladdr = NULL;
132 }
133 
dst_reg(const src_reg & reg)134 dst_reg::dst_reg(const src_reg &reg) :
135    backend_reg(reg)
136 {
137    this->writemask = brw_mask_for_swizzle(reg.swizzle);
138    this->reladdr = reg.reladdr;
139 }
140 
141 bool
equals(const dst_reg & r) const142 dst_reg::equals(const dst_reg &r) const
143 {
144    return (this->backend_reg::equals(r) &&
145            (reladdr == r.reladdr ||
146             (reladdr && r.reladdr && reladdr->equals(*r.reladdr))));
147 }
148 
149 bool
is_send_from_grf()150 vec4_instruction::is_send_from_grf()
151 {
152    switch (opcode) {
153    case SHADER_OPCODE_SHADER_TIME_ADD:
154    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
155    case SHADER_OPCODE_UNTYPED_ATOMIC:
156    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
157    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
158    case SHADER_OPCODE_TYPED_ATOMIC:
159    case SHADER_OPCODE_TYPED_SURFACE_READ:
160    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
161    case VEC4_OPCODE_URB_READ:
162    case TCS_OPCODE_URB_WRITE:
163    case TCS_OPCODE_RELEASE_INPUT:
164    case SHADER_OPCODE_BARRIER:
165       return true;
166    default:
167       return false;
168    }
169 }
170 
171 /**
172  * Returns true if this instruction's sources and destinations cannot
173  * safely be the same register.
174  *
175  * In most cases, a register can be written over safely by the same
176  * instruction that is its last use.  For a single instruction, the
177  * sources are dereferenced before writing of the destination starts
178  * (naturally).
179  *
180  * However, there are a few cases where this can be problematic:
181  *
182  * - Virtual opcodes that translate to multiple instructions in the
183  *   code generator: if src == dst and one instruction writes the
184  *   destination before a later instruction reads the source, then
185  *   src will have been clobbered.
186  *
187  * The register allocator uses this information to set up conflicts between
188  * GRF sources and the destination.
189  */
190 bool
has_source_and_destination_hazard() const191 vec4_instruction::has_source_and_destination_hazard() const
192 {
193    switch (opcode) {
194    case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
195    case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
196    case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
197       return true;
198    default:
199       /* 8-wide compressed DF operations are executed as two 4-wide operations,
200        * so we have a src/dst hazard if the first half of the instruction
201        * overwrites the source of the second half. Prevent this by marking
202        * compressed instructions as having src/dst hazards, so the register
203        * allocator assigns safe register regions for dst and srcs.
204        */
205       return size_written > REG_SIZE;
206    }
207 }
208 
209 unsigned
size_read(unsigned arg) const210 vec4_instruction::size_read(unsigned arg) const
211 {
212    switch (opcode) {
213    case SHADER_OPCODE_SHADER_TIME_ADD:
214    case SHADER_OPCODE_UNTYPED_ATOMIC:
215    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
216    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
217    case SHADER_OPCODE_TYPED_ATOMIC:
218    case SHADER_OPCODE_TYPED_SURFACE_READ:
219    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
220    case TCS_OPCODE_URB_WRITE:
221       if (arg == 0)
222          return mlen * REG_SIZE;
223       break;
224    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
225       if (arg == 1)
226          return mlen * REG_SIZE;
227       break;
228    default:
229       break;
230    }
231 
232    switch (src[arg].file) {
233    case BAD_FILE:
234       return 0;
235    case IMM:
236    case UNIFORM:
237       return 4 * type_sz(src[arg].type);
238    default:
239       /* XXX - Represent actual vertical stride. */
240       return exec_size * type_sz(src[arg].type);
241    }
242 }
243 
244 bool
can_do_source_mods(const struct gen_device_info * devinfo)245 vec4_instruction::can_do_source_mods(const struct gen_device_info *devinfo)
246 {
247    if (devinfo->gen == 6 && is_math())
248       return false;
249 
250    if (is_send_from_grf())
251       return false;
252 
253    if (!backend_instruction::can_do_source_mods())
254       return false;
255 
256    return true;
257 }
258 
259 bool
can_do_writemask(const struct gen_device_info * devinfo)260 vec4_instruction::can_do_writemask(const struct gen_device_info *devinfo)
261 {
262    switch (opcode) {
263    case SHADER_OPCODE_GEN4_SCRATCH_READ:
264    case VEC4_OPCODE_DOUBLE_TO_F32:
265    case VEC4_OPCODE_DOUBLE_TO_D32:
266    case VEC4_OPCODE_DOUBLE_TO_U32:
267    case VEC4_OPCODE_TO_DOUBLE:
268    case VEC4_OPCODE_PICK_LOW_32BIT:
269    case VEC4_OPCODE_PICK_HIGH_32BIT:
270    case VEC4_OPCODE_SET_LOW_32BIT:
271    case VEC4_OPCODE_SET_HIGH_32BIT:
272    case VS_OPCODE_PULL_CONSTANT_LOAD:
273    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
274    case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
275    case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
276    case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
277    case TES_OPCODE_CREATE_INPUT_READ_HEADER:
278    case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
279    case VEC4_OPCODE_URB_READ:
280    case SHADER_OPCODE_MOV_INDIRECT:
281       return false;
282    default:
283       /* The MATH instruction on Gen6 only executes in align1 mode, which does
284        * not support writemasking.
285        */
286       if (devinfo->gen == 6 && is_math())
287          return false;
288 
289       if (is_tex())
290          return false;
291 
292       return true;
293    }
294 }
295 
296 bool
can_change_types() const297 vec4_instruction::can_change_types() const
298 {
299    return dst.type == src[0].type &&
300           !src[0].abs && !src[0].negate && !saturate &&
301           (opcode == BRW_OPCODE_MOV ||
302            (opcode == BRW_OPCODE_SEL &&
303             dst.type == src[1].type &&
304             predicate != BRW_PREDICATE_NONE &&
305             !src[1].abs && !src[1].negate));
306 }
307 
308 /**
309  * Returns how many MRFs an opcode will write over.
310  *
311  * Note that this is not the 0 or 1 implied writes in an actual gen
312  * instruction -- the generate_* functions generate additional MOVs
313  * for setup.
314  */
315 int
implied_mrf_writes(vec4_instruction * inst)316 vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
317 {
318    if (inst->mlen == 0 || inst->is_send_from_grf())
319       return 0;
320 
321    switch (inst->opcode) {
322    case SHADER_OPCODE_RCP:
323    case SHADER_OPCODE_RSQ:
324    case SHADER_OPCODE_SQRT:
325    case SHADER_OPCODE_EXP2:
326    case SHADER_OPCODE_LOG2:
327    case SHADER_OPCODE_SIN:
328    case SHADER_OPCODE_COS:
329       return 1;
330    case SHADER_OPCODE_INT_QUOTIENT:
331    case SHADER_OPCODE_INT_REMAINDER:
332    case SHADER_OPCODE_POW:
333    case TCS_OPCODE_THREAD_END:
334       return 2;
335    case VS_OPCODE_URB_WRITE:
336       return 1;
337    case VS_OPCODE_PULL_CONSTANT_LOAD:
338       return 2;
339    case SHADER_OPCODE_GEN4_SCRATCH_READ:
340       return 2;
341    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
342       return 3;
343    case GS_OPCODE_URB_WRITE:
344    case GS_OPCODE_URB_WRITE_ALLOCATE:
345    case GS_OPCODE_THREAD_END:
346       return 0;
347    case GS_OPCODE_FF_SYNC:
348       return 1;
349    case TCS_OPCODE_URB_WRITE:
350       return 0;
351    case SHADER_OPCODE_SHADER_TIME_ADD:
352       return 0;
353    case SHADER_OPCODE_TEX:
354    case SHADER_OPCODE_TXL:
355    case SHADER_OPCODE_TXD:
356    case SHADER_OPCODE_TXF:
357    case SHADER_OPCODE_TXF_CMS:
358    case SHADER_OPCODE_TXF_CMS_W:
359    case SHADER_OPCODE_TXF_MCS:
360    case SHADER_OPCODE_TXS:
361    case SHADER_OPCODE_TG4:
362    case SHADER_OPCODE_TG4_OFFSET:
363    case SHADER_OPCODE_SAMPLEINFO:
364    case SHADER_OPCODE_GET_BUFFER_SIZE:
365       return inst->header_size;
366    default:
367       unreachable("not reached");
368    }
369 }
370 
371 bool
equals(const src_reg & r) const372 src_reg::equals(const src_reg &r) const
373 {
374    return (this->backend_reg::equals(r) &&
375 	   !reladdr && !r.reladdr);
376 }
377 
378 bool
opt_vector_float()379 vec4_visitor::opt_vector_float()
380 {
381    bool progress = false;
382 
383    foreach_block(block, cfg) {
384       int last_reg = -1, last_offset = -1;
385       enum brw_reg_file last_reg_file = BAD_FILE;
386 
387       uint8_t imm[4] = { 0 };
388       int inst_count = 0;
389       vec4_instruction *imm_inst[4];
390       unsigned writemask = 0;
391       enum brw_reg_type dest_type = BRW_REGISTER_TYPE_F;
392 
393       foreach_inst_in_block_safe(vec4_instruction, inst, block) {
394          int vf = -1;
395          enum brw_reg_type need_type;
396 
397          /* Look for unconditional MOVs from an immediate with a partial
398           * writemask.  Skip type-conversion MOVs other than integer 0,
399           * where the type doesn't matter.  See if the immediate can be
400           * represented as a VF.
401           */
402          if (inst->opcode == BRW_OPCODE_MOV &&
403              inst->src[0].file == IMM &&
404              inst->predicate == BRW_PREDICATE_NONE &&
405              inst->dst.writemask != WRITEMASK_XYZW &&
406              type_sz(inst->src[0].type) < 8 &&
407              (inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) {
408 
409             vf = brw_float_to_vf(inst->src[0].d);
410             need_type = BRW_REGISTER_TYPE_D;
411 
412             if (vf == -1) {
413                vf = brw_float_to_vf(inst->src[0].f);
414                need_type = BRW_REGISTER_TYPE_F;
415             }
416          } else {
417             last_reg = -1;
418          }
419 
420          /* If this wasn't a MOV, or the destination register doesn't match,
421           * or we have to switch destination types, then this breaks our
422           * sequence.  Combine anything we've accumulated so far.
423           */
424          if (last_reg != inst->dst.nr ||
425              last_offset != inst->dst.offset ||
426              last_reg_file != inst->dst.file ||
427              (vf > 0 && dest_type != need_type)) {
428 
429             if (inst_count > 1) {
430                unsigned vf;
431                memcpy(&vf, imm, sizeof(vf));
432                vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf));
433                mov->dst.type = dest_type;
434                mov->dst.writemask = writemask;
435                inst->insert_before(block, mov);
436 
437                for (int i = 0; i < inst_count; i++) {
438                   imm_inst[i]->remove(block);
439                }
440 
441                progress = true;
442             }
443 
444             inst_count = 0;
445             last_reg = -1;
446             writemask = 0;
447             dest_type = BRW_REGISTER_TYPE_F;
448 
449             for (int i = 0; i < 4; i++) {
450                imm[i] = 0;
451             }
452          }
453 
454          /* Record this instruction's value (if it was representable). */
455          if (vf != -1) {
456             if ((inst->dst.writemask & WRITEMASK_X) != 0)
457                imm[0] = vf;
458             if ((inst->dst.writemask & WRITEMASK_Y) != 0)
459                imm[1] = vf;
460             if ((inst->dst.writemask & WRITEMASK_Z) != 0)
461                imm[2] = vf;
462             if ((inst->dst.writemask & WRITEMASK_W) != 0)
463                imm[3] = vf;
464 
465             writemask |= inst->dst.writemask;
466             imm_inst[inst_count++] = inst;
467 
468             last_reg = inst->dst.nr;
469             last_offset = inst->dst.offset;
470             last_reg_file = inst->dst.file;
471             if (vf > 0)
472                dest_type = need_type;
473          }
474       }
475    }
476 
477    if (progress)
478       invalidate_live_intervals();
479 
480    return progress;
481 }
482 
483 /* Replaces unused channels of a swizzle with channels that are used.
484  *
485  * For instance, this pass transforms
486  *
487  *    mov vgrf4.yz, vgrf5.wxzy
488  *
489  * into
490  *
491  *    mov vgrf4.yz, vgrf5.xxzx
492  *
493  * This eliminates false uses of some channels, letting dead code elimination
494  * remove the instructions that wrote them.
495  */
496 bool
opt_reduce_swizzle()497 vec4_visitor::opt_reduce_swizzle()
498 {
499    bool progress = false;
500 
501    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
502       if (inst->dst.file == BAD_FILE ||
503           inst->dst.file == ARF ||
504           inst->dst.file == FIXED_GRF ||
505           inst->is_send_from_grf())
506          continue;
507 
508       unsigned swizzle;
509 
510       /* Determine which channels of the sources are read. */
511       switch (inst->opcode) {
512       case VEC4_OPCODE_PACK_BYTES:
513       case BRW_OPCODE_DP4:
514       case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
515                             *           but all four of src1.
516                             */
517          swizzle = brw_swizzle_for_size(4);
518          break;
519       case BRW_OPCODE_DP3:
520          swizzle = brw_swizzle_for_size(3);
521          break;
522       case BRW_OPCODE_DP2:
523          swizzle = brw_swizzle_for_size(2);
524          break;
525 
526       case VEC4_OPCODE_TO_DOUBLE:
527       case VEC4_OPCODE_DOUBLE_TO_F32:
528       case VEC4_OPCODE_DOUBLE_TO_D32:
529       case VEC4_OPCODE_DOUBLE_TO_U32:
530       case VEC4_OPCODE_PICK_LOW_32BIT:
531       case VEC4_OPCODE_PICK_HIGH_32BIT:
532       case VEC4_OPCODE_SET_LOW_32BIT:
533       case VEC4_OPCODE_SET_HIGH_32BIT:
534          swizzle = brw_swizzle_for_size(4);
535          break;
536 
537       default:
538          swizzle = brw_swizzle_for_mask(inst->dst.writemask);
539          break;
540       }
541 
542       /* Update sources' swizzles. */
543       for (int i = 0; i < 3; i++) {
544          if (inst->src[i].file != VGRF &&
545              inst->src[i].file != ATTR &&
546              inst->src[i].file != UNIFORM)
547             continue;
548 
549          const unsigned new_swizzle =
550             brw_compose_swizzle(swizzle, inst->src[i].swizzle);
551          if (inst->src[i].swizzle != new_swizzle) {
552             inst->src[i].swizzle = new_swizzle;
553             progress = true;
554          }
555       }
556    }
557 
558    if (progress)
559       invalidate_live_intervals();
560 
561    return progress;
562 }
563 
564 void
split_uniform_registers()565 vec4_visitor::split_uniform_registers()
566 {
567    /* Prior to this, uniforms have been in an array sized according to
568     * the number of vector uniforms present, sparsely filled (so an
569     * aggregate results in reg indices being skipped over).  Now we're
570     * going to cut those aggregates up so each .nr index is one
571     * vector.  The goal is to make elimination of unused uniform
572     * components easier later.
573     */
574    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
575       for (int i = 0 ; i < 3; i++) {
576 	 if (inst->src[i].file != UNIFORM)
577 	    continue;
578 
579 	 assert(!inst->src[i].reladdr);
580 
581          inst->src[i].nr += inst->src[i].offset / 16;
582 	 inst->src[i].offset %= 16;
583       }
584    }
585 }
586 
587 /* This function returns the register number where we placed the uniform */
588 static int
set_push_constant_loc(const int nr_uniforms,int * new_uniform_count,const int src,const int size,const int channel_size,int * new_loc,int * new_chan,int * new_chans_used)589 set_push_constant_loc(const int nr_uniforms, int *new_uniform_count,
590                       const int src, const int size, const int channel_size,
591                       int *new_loc, int *new_chan,
592                       int *new_chans_used)
593 {
594    int dst;
595    /* Find the lowest place we can slot this uniform in. */
596    for (dst = 0; dst < nr_uniforms; dst++) {
597       if (ALIGN(new_chans_used[dst], channel_size) + size <= 4)
598          break;
599    }
600 
601    assert(dst < nr_uniforms);
602 
603    new_loc[src] = dst;
604    new_chan[src] = ALIGN(new_chans_used[dst], channel_size);
605    new_chans_used[dst] = ALIGN(new_chans_used[dst], channel_size) + size;
606 
607    *new_uniform_count = MAX2(*new_uniform_count, dst + 1);
608    return dst;
609 }
610 
611 void
pack_uniform_registers()612 vec4_visitor::pack_uniform_registers()
613 {
614    uint8_t chans_used[this->uniforms];
615    int new_loc[this->uniforms];
616    int new_chan[this->uniforms];
617    bool is_aligned_to_dvec4[this->uniforms];
618    int new_chans_used[this->uniforms];
619    int channel_sizes[this->uniforms];
620 
621    memset(chans_used, 0, sizeof(chans_used));
622    memset(new_loc, 0, sizeof(new_loc));
623    memset(new_chan, 0, sizeof(new_chan));
624    memset(new_chans_used, 0, sizeof(new_chans_used));
625    memset(is_aligned_to_dvec4, 0, sizeof(is_aligned_to_dvec4));
626    memset(channel_sizes, 0, sizeof(channel_sizes));
627 
628    /* Find which uniform vectors are actually used by the program.  We
629     * expect unused vector elements when we've moved array access out
630     * to pull constants, and from some GLSL code generators like wine.
631     */
632    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
633       unsigned readmask;
634       switch (inst->opcode) {
635       case VEC4_OPCODE_PACK_BYTES:
636       case BRW_OPCODE_DP4:
637       case BRW_OPCODE_DPH:
638          readmask = 0xf;
639          break;
640       case BRW_OPCODE_DP3:
641          readmask = 0x7;
642          break;
643       case BRW_OPCODE_DP2:
644          readmask = 0x3;
645          break;
646       default:
647          readmask = inst->dst.writemask;
648          break;
649       }
650 
651       for (int i = 0 ; i < 3; i++) {
652          if (inst->src[i].file != UNIFORM)
653             continue;
654 
655          assert(type_sz(inst->src[i].type) % 4 == 0);
656          int channel_size = type_sz(inst->src[i].type) / 4;
657 
658          int reg = inst->src[i].nr;
659          for (int c = 0; c < 4; c++) {
660             if (!(readmask & (1 << c)))
661                continue;
662 
663             unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1;
664             unsigned used = MAX2(chans_used[reg], channel * channel_size);
665             if (used <= 4) {
666                chans_used[reg] = used;
667                channel_sizes[reg] = MAX2(channel_sizes[reg], channel_size);
668             } else {
669                is_aligned_to_dvec4[reg] = true;
670                is_aligned_to_dvec4[reg + 1] = true;
671                chans_used[reg + 1] = used - 4;
672                channel_sizes[reg + 1] = MAX2(channel_sizes[reg + 1], channel_size);
673             }
674          }
675       }
676 
677       if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
678           inst->src[0].file == UNIFORM) {
679          assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
680          assert(inst->src[0].subnr == 0);
681 
682          unsigned bytes_read = inst->src[2].ud;
683          assert(bytes_read % 4 == 0);
684          unsigned vec4s_read = DIV_ROUND_UP(bytes_read, 16);
685 
686          /* We just mark every register touched by a MOV_INDIRECT as being
687           * fully used.  This ensures that it doesn't broken up piecewise by
688           * the next part of our packing algorithm.
689           */
690          int reg = inst->src[0].nr;
691          int channel_size = type_sz(inst->src[0].type) / 4;
692          for (unsigned i = 0; i < vec4s_read; i++) {
693             chans_used[reg + i] = 4;
694             channel_sizes[reg + i] = MAX2(channel_sizes[reg + i], channel_size);
695          }
696       }
697    }
698 
699    int new_uniform_count = 0;
700 
701    /* As the uniforms are going to be reordered, take the data from a temporary
702     * copy of the original param[].
703     */
704    uint32_t *param = ralloc_array(NULL, uint32_t, stage_prog_data->nr_params);
705    memcpy(param, stage_prog_data->param,
706           sizeof(uint32_t) * stage_prog_data->nr_params);
707 
708    /* Now, figure out a packing of the live uniform vectors into our
709     * push constants. Start with dvec{3,4} because they are aligned to
710     * dvec4 size (2 vec4).
711     */
712    for (int src = 0; src < uniforms; src++) {
713       int size = chans_used[src];
714 
715       if (size == 0 || !is_aligned_to_dvec4[src])
716          continue;
717 
718       /* dvec3 are aligned to dvec4 size, apply the alignment of the size
719        * to 4 to avoid moving last component of a dvec3 to the available
720        * location at the end of a previous dvec3. These available locations
721        * could be filled by smaller variables in next loop.
722        */
723       size = ALIGN(size, 4);
724       int dst = set_push_constant_loc(uniforms, &new_uniform_count,
725                                       src, size, channel_sizes[src],
726                                       new_loc, new_chan,
727                                       new_chans_used);
728       /* Move the references to the data */
729       for (int j = 0; j < size; j++) {
730          stage_prog_data->param[dst * 4 + new_chan[src] + j] =
731             param[src * 4 + j];
732       }
733    }
734 
735    /* Continue with the rest of data, which is aligned to vec4. */
736    for (int src = 0; src < uniforms; src++) {
737       int size = chans_used[src];
738 
739       if (size == 0 || is_aligned_to_dvec4[src])
740          continue;
741 
742       int dst = set_push_constant_loc(uniforms, &new_uniform_count,
743                                       src, size, channel_sizes[src],
744                                       new_loc, new_chan,
745                                       new_chans_used);
746       /* Move the references to the data */
747       for (int j = 0; j < size; j++) {
748          stage_prog_data->param[dst * 4 + new_chan[src] + j] =
749             param[src * 4 + j];
750       }
751    }
752 
753    ralloc_free(param);
754    this->uniforms = new_uniform_count;
755 
756    /* Now, update the instructions for our repacked uniforms. */
757    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
758       for (int i = 0 ; i < 3; i++) {
759          int src = inst->src[i].nr;
760 
761          if (inst->src[i].file != UNIFORM)
762             continue;
763 
764          int chan = new_chan[src] / channel_sizes[src];
765          inst->src[i].nr = new_loc[src];
766          inst->src[i].swizzle += BRW_SWIZZLE4(chan, chan, chan, chan);
767       }
768    }
769 }
770 
771 /**
772  * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
773  *
774  * While GLSL IR also performs this optimization, we end up with it in
775  * our instruction stream for a couple of reasons.  One is that we
776  * sometimes generate silly instructions, for example in array access
777  * where we'll generate "ADD offset, index, base" even if base is 0.
778  * The other is that GLSL IR's constant propagation doesn't track the
779  * components of aggregates, so some VS patterns (initialize matrix to
780  * 0, accumulate in vertex blending factors) end up breaking down to
781  * instructions involving 0.
782  */
783 bool
opt_algebraic()784 vec4_visitor::opt_algebraic()
785 {
786    bool progress = false;
787 
788    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
789       switch (inst->opcode) {
790       case BRW_OPCODE_MOV:
791          if (inst->src[0].file != IMM)
792             break;
793 
794          if (inst->saturate) {
795             if (inst->dst.type != inst->src[0].type)
796                assert(!"unimplemented: saturate mixed types");
797 
798             if (brw_saturate_immediate(inst->dst.type,
799                                        &inst->src[0].as_brw_reg())) {
800                inst->saturate = false;
801                progress = true;
802             }
803          }
804          break;
805 
806       case VEC4_OPCODE_UNPACK_UNIFORM:
807          if (inst->src[0].file != UNIFORM) {
808             inst->opcode = BRW_OPCODE_MOV;
809             progress = true;
810          }
811          break;
812 
813       case BRW_OPCODE_ADD:
814 	 if (inst->src[1].is_zero()) {
815 	    inst->opcode = BRW_OPCODE_MOV;
816 	    inst->src[1] = src_reg();
817 	    progress = true;
818 	 }
819 	 break;
820 
821       case BRW_OPCODE_MUL:
822 	 if (inst->src[1].is_zero()) {
823 	    inst->opcode = BRW_OPCODE_MOV;
824 	    switch (inst->src[0].type) {
825 	    case BRW_REGISTER_TYPE_F:
826 	       inst->src[0] = brw_imm_f(0.0f);
827 	       break;
828 	    case BRW_REGISTER_TYPE_D:
829 	       inst->src[0] = brw_imm_d(0);
830 	       break;
831 	    case BRW_REGISTER_TYPE_UD:
832 	       inst->src[0] = brw_imm_ud(0u);
833 	       break;
834 	    default:
835 	       unreachable("not reached");
836 	    }
837 	    inst->src[1] = src_reg();
838 	    progress = true;
839 	 } else if (inst->src[1].is_one()) {
840 	    inst->opcode = BRW_OPCODE_MOV;
841 	    inst->src[1] = src_reg();
842 	    progress = true;
843          } else if (inst->src[1].is_negative_one()) {
844             inst->opcode = BRW_OPCODE_MOV;
845             inst->src[0].negate = !inst->src[0].negate;
846             inst->src[1] = src_reg();
847             progress = true;
848 	 }
849 	 break;
850       case BRW_OPCODE_CMP:
851          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
852              inst->src[0].abs &&
853              inst->src[0].negate &&
854              inst->src[1].is_zero()) {
855             inst->src[0].abs = false;
856             inst->src[0].negate = false;
857             inst->conditional_mod = BRW_CONDITIONAL_Z;
858             progress = true;
859             break;
860          }
861          break;
862       case SHADER_OPCODE_BROADCAST:
863          if (is_uniform(inst->src[0]) ||
864              inst->src[1].is_zero()) {
865             inst->opcode = BRW_OPCODE_MOV;
866             inst->src[1] = src_reg();
867             inst->force_writemask_all = true;
868             progress = true;
869          }
870          break;
871 
872       default:
873 	 break;
874       }
875    }
876 
877    if (progress)
878       invalidate_live_intervals();
879 
880    return progress;
881 }
882 
883 /**
884  * Only a limited number of hardware registers may be used for push
885  * constants, so this turns access to the overflowed constants into
886  * pull constants.
887  */
888 void
move_push_constants_to_pull_constants()889 vec4_visitor::move_push_constants_to_pull_constants()
890 {
891    int pull_constant_loc[this->uniforms];
892 
893    /* Only allow 32 registers (256 uniform components) as push constants,
894     * which is the limit on gen6.
895     *
896     * If changing this value, note the limitation about total_regs in
897     * brw_curbe.c.
898     */
899    int max_uniform_components = 32 * 8;
900    if (this->uniforms * 4 <= max_uniform_components)
901       return;
902 
903    /* Make some sort of choice as to which uniforms get sent to pull
904     * constants.  We could potentially do something clever here like
905     * look for the most infrequently used uniform vec4s, but leave
906     * that for later.
907     */
908    for (int i = 0; i < this->uniforms * 4; i += 4) {
909       pull_constant_loc[i / 4] = -1;
910 
911       if (i >= max_uniform_components) {
912          uint32_t *values = &stage_prog_data->param[i];
913 
914          /* Try to find an existing copy of this uniform in the pull
915           * constants if it was part of an array access already.
916           */
917          for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
918             int matches;
919 
920             for (matches = 0; matches < 4; matches++) {
921                if (stage_prog_data->pull_param[j + matches] != values[matches])
922                   break;
923             }
924 
925             if (matches == 4) {
926                pull_constant_loc[i / 4] = j / 4;
927                break;
928             }
929          }
930 
931          if (pull_constant_loc[i / 4] == -1) {
932             assert(stage_prog_data->nr_pull_params % 4 == 0);
933             pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;
934 
935             for (int j = 0; j < 4; j++) {
936                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
937                   values[j];
938             }
939          }
940       }
941    }
942 
943    /* Now actually rewrite usage of the things we've moved to pull
944     * constants.
945     */
946    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
947       for (int i = 0 ; i < 3; i++) {
948          if (inst->src[i].file != UNIFORM ||
949              pull_constant_loc[inst->src[i].nr] == -1)
950             continue;
951 
952          int uniform = inst->src[i].nr;
953 
954          const glsl_type *temp_type = type_sz(inst->src[i].type) == 8 ?
955             glsl_type::dvec4_type : glsl_type::vec4_type;
956          dst_reg temp = dst_reg(this, temp_type);
957 
958          emit_pull_constant_load(block, inst, temp, inst->src[i],
959                                  pull_constant_loc[uniform], src_reg());
960 
961          inst->src[i].file = temp.file;
962          inst->src[i].nr = temp.nr;
963          inst->src[i].offset %= 16;
964          inst->src[i].reladdr = NULL;
965       }
966    }
967 
968    /* Repack push constants to remove the now-unused ones. */
969    pack_uniform_registers();
970 }
971 
972 /* Conditions for which we want to avoid setting the dependency control bits */
973 bool
is_dep_ctrl_unsafe(const vec4_instruction * inst)974 vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
975 {
976 #define IS_DWORD(reg) \
977    (reg.type == BRW_REGISTER_TYPE_UD || \
978     reg.type == BRW_REGISTER_TYPE_D)
979 
980 #define IS_64BIT(reg) (reg.file != BAD_FILE && type_sz(reg.type) == 8)
981 
982    /* From the Cherryview and Broadwell PRMs:
983     *
984     * "When source or destination datatype is 64b or operation is integer DWord
985     * multiply, DepCtrl must not be used."
986     *
987     * SKL PRMs don't include this restriction, however, gen7 seems to be
988     * affected, at least by the 64b restriction, since DepCtrl with double
989     * precision instructions seems to produce GPU hangs in some cases.
990     */
991    if (devinfo->gen == 8 || gen_device_info_is_9lp(devinfo)) {
992       if (inst->opcode == BRW_OPCODE_MUL &&
993          IS_DWORD(inst->src[0]) &&
994          IS_DWORD(inst->src[1]))
995          return true;
996    }
997 
998    if (devinfo->gen >= 7 && devinfo->gen <= 8) {
999       if (IS_64BIT(inst->dst) || IS_64BIT(inst->src[0]) ||
1000           IS_64BIT(inst->src[1]) || IS_64BIT(inst->src[2]))
1001       return true;
1002    }
1003 
1004 #undef IS_64BIT
1005 #undef IS_DWORD
1006 
1007    if (devinfo->gen >= 8) {
1008       if (inst->opcode == BRW_OPCODE_F32TO16)
1009          return true;
1010    }
1011 
1012    /*
1013     * mlen:
1014     * In the presence of send messages, totally interrupt dependency
1015     * control. They're long enough that the chance of dependency
1016     * control around them just doesn't matter.
1017     *
1018     * predicate:
1019     * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
1020     * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
1021     * completes the scoreboard clear must have a non-zero execution mask. This
1022     * means, if any kind of predication can change the execution mask or channel
1023     * enable of the last instruction, the optimization must be avoided. This is
1024     * to avoid instructions being shot down the pipeline when no writes are
1025     * required.
1026     *
1027     * math:
1028     * Dependency control does not work well over math instructions.
1029     * NB: Discovered empirically
1030     */
1031    return (inst->mlen || inst->predicate || inst->is_math());
1032 }
1033 
1034 /**
1035  * Sets the dependency control fields on instructions after register
1036  * allocation and before the generator is run.
1037  *
1038  * When you have a sequence of instructions like:
1039  *
1040  * DP4 temp.x vertex uniform[0]
1041  * DP4 temp.y vertex uniform[0]
1042  * DP4 temp.z vertex uniform[0]
1043  * DP4 temp.w vertex uniform[0]
1044  *
1045  * The hardware doesn't know that it can actually run the later instructions
1046  * while the previous ones are in flight, producing stalls.  However, we have
1047  * manual fields we can set in the instructions that let it do so.
1048  */
1049 void
opt_set_dependency_control()1050 vec4_visitor::opt_set_dependency_control()
1051 {
1052    vec4_instruction *last_grf_write[BRW_MAX_GRF];
1053    uint8_t grf_channels_written[BRW_MAX_GRF];
1054    vec4_instruction *last_mrf_write[BRW_MAX_GRF];
1055    uint8_t mrf_channels_written[BRW_MAX_GRF];
1056 
1057    assert(prog_data->total_grf ||
1058           !"Must be called after register allocation");
1059 
1060    foreach_block (block, cfg) {
1061       memset(last_grf_write, 0, sizeof(last_grf_write));
1062       memset(last_mrf_write, 0, sizeof(last_mrf_write));
1063 
1064       foreach_inst_in_block (vec4_instruction, inst, block) {
1065          /* If we read from a register that we were doing dependency control
1066           * on, don't do dependency control across the read.
1067           */
1068          for (int i = 0; i < 3; i++) {
1069             int reg = inst->src[i].nr + inst->src[i].offset / REG_SIZE;
1070             if (inst->src[i].file == VGRF) {
1071                last_grf_write[reg] = NULL;
1072             } else if (inst->src[i].file == FIXED_GRF) {
1073                memset(last_grf_write, 0, sizeof(last_grf_write));
1074                break;
1075             }
1076             assert(inst->src[i].file != MRF);
1077          }
1078 
1079          if (is_dep_ctrl_unsafe(inst)) {
1080             memset(last_grf_write, 0, sizeof(last_grf_write));
1081             memset(last_mrf_write, 0, sizeof(last_mrf_write));
1082             continue;
1083          }
1084 
1085          /* Now, see if we can do dependency control for this instruction
1086           * against a previous one writing to its destination.
1087           */
1088          int reg = inst->dst.nr + inst->dst.offset / REG_SIZE;
1089          if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
1090             if (last_grf_write[reg] &&
1091                 last_grf_write[reg]->dst.offset == inst->dst.offset &&
1092                 !(inst->dst.writemask & grf_channels_written[reg])) {
1093                last_grf_write[reg]->no_dd_clear = true;
1094                inst->no_dd_check = true;
1095             } else {
1096                grf_channels_written[reg] = 0;
1097             }
1098 
1099             last_grf_write[reg] = inst;
1100             grf_channels_written[reg] |= inst->dst.writemask;
1101          } else if (inst->dst.file == MRF) {
1102             if (last_mrf_write[reg] &&
1103                 last_mrf_write[reg]->dst.offset == inst->dst.offset &&
1104                 !(inst->dst.writemask & mrf_channels_written[reg])) {
1105                last_mrf_write[reg]->no_dd_clear = true;
1106                inst->no_dd_check = true;
1107             } else {
1108                mrf_channels_written[reg] = 0;
1109             }
1110 
1111             last_mrf_write[reg] = inst;
1112             mrf_channels_written[reg] |= inst->dst.writemask;
1113          }
1114       }
1115    }
1116 }
1117 
1118 bool
can_reswizzle(const struct gen_device_info * devinfo,int dst_writemask,int swizzle,int swizzle_mask)1119 vec4_instruction::can_reswizzle(const struct gen_device_info *devinfo,
1120                                 int dst_writemask,
1121                                 int swizzle,
1122                                 int swizzle_mask)
1123 {
1124    /* Gen6 MATH instructions can not execute in align16 mode, so swizzles
1125     * are not allowed.
1126     */
1127    if (devinfo->gen == 6 && is_math() && swizzle != BRW_SWIZZLE_XYZW)
1128       return false;
1129 
1130    /* We can't swizzle implicit accumulator access.  We'd have to
1131     * reswizzle the producer of the accumulator value in addition
1132     * to the consumer (i.e. both MUL and MACH).  Just skip this.
1133     */
1134    if (reads_accumulator_implicitly())
1135       return false;
1136 
1137    if (!can_do_writemask(devinfo) && dst_writemask != WRITEMASK_XYZW)
1138       return false;
1139 
1140    /* If this instruction sets anything not referenced by swizzle, then we'd
1141     * totally break it when we reswizzle.
1142     */
1143    if (dst.writemask & ~swizzle_mask)
1144       return false;
1145 
1146    if (mlen > 0)
1147       return false;
1148 
1149    for (int i = 0; i < 3; i++) {
1150       if (src[i].is_accumulator())
1151          return false;
1152    }
1153 
1154    return true;
1155 }
1156 
1157 /**
1158  * For any channels in the swizzle's source that were populated by this
1159  * instruction, rewrite the instruction to put the appropriate result directly
1160  * in those channels.
1161  *
1162  * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x
1163  */
1164 void
reswizzle(int dst_writemask,int swizzle)1165 vec4_instruction::reswizzle(int dst_writemask, int swizzle)
1166 {
1167    /* Destination write mask doesn't correspond to source swizzle for the dot
1168     * product and pack_bytes instructions.
1169     */
1170    if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
1171        opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 &&
1172        opcode != VEC4_OPCODE_PACK_BYTES) {
1173       for (int i = 0; i < 3; i++) {
1174          if (src[i].file == BAD_FILE || src[i].file == IMM)
1175             continue;
1176 
1177          src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
1178       }
1179    }
1180 
1181    /* Apply the specified swizzle and writemask to the original mask of
1182     * written components.
1183     */
1184    dst.writemask = dst_writemask &
1185                    brw_apply_swizzle_to_mask(swizzle, dst.writemask);
1186 }
1187 
1188 /*
1189  * Tries to reduce extra MOV instructions by taking temporary GRFs that get
1190  * just written and then MOVed into another reg and making the original write
1191  * of the GRF write directly to the final destination instead.
1192  */
1193 bool
opt_register_coalesce()1194 vec4_visitor::opt_register_coalesce()
1195 {
1196    bool progress = false;
1197    int next_ip = 0;
1198 
1199    calculate_live_intervals();
1200 
1201    foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
1202       int ip = next_ip;
1203       next_ip++;
1204 
1205       if (inst->opcode != BRW_OPCODE_MOV ||
1206           (inst->dst.file != VGRF && inst->dst.file != MRF) ||
1207 	  inst->predicate ||
1208 	  inst->src[0].file != VGRF ||
1209 	  inst->dst.type != inst->src[0].type ||
1210 	  inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
1211 	 continue;
1212 
1213       /* Remove no-op MOVs */
1214       if (inst->dst.file == inst->src[0].file &&
1215           inst->dst.nr == inst->src[0].nr &&
1216           inst->dst.offset == inst->src[0].offset) {
1217          bool is_nop_mov = true;
1218 
1219          for (unsigned c = 0; c < 4; c++) {
1220             if ((inst->dst.writemask & (1 << c)) == 0)
1221                continue;
1222 
1223             if (BRW_GET_SWZ(inst->src[0].swizzle, c) != c) {
1224                is_nop_mov = false;
1225                break;
1226             }
1227          }
1228 
1229          if (is_nop_mov) {
1230             inst->remove(block);
1231             progress = true;
1232             continue;
1233          }
1234       }
1235 
1236       bool to_mrf = (inst->dst.file == MRF);
1237 
1238       /* Can't coalesce this GRF if someone else was going to
1239        * read it later.
1240        */
1241       if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip)
1242 	 continue;
1243 
1244       /* We need to check interference with the final destination between this
1245        * instruction and the earliest instruction involved in writing the GRF
1246        * we're eliminating.  To do that, keep track of which of our source
1247        * channels we've seen initialized.
1248        */
1249       const unsigned chans_needed =
1250          brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
1251                                        inst->dst.writemask);
1252       unsigned chans_remaining = chans_needed;
1253 
1254       /* Now walk up the instruction stream trying to see if we can rewrite
1255        * everything writing to the temporary to write into the destination
1256        * instead.
1257        */
1258       vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
1259       foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
1260                                                   inst) {
1261          _scan_inst = scan_inst;
1262 
1263          if (regions_overlap(inst->src[0], inst->size_read(0),
1264                              scan_inst->dst, scan_inst->size_written)) {
1265             /* Found something writing to the reg we want to coalesce away. */
1266             if (to_mrf) {
1267                /* SEND instructions can't have MRF as a destination. */
1268                if (scan_inst->mlen)
1269                   break;
1270 
1271                if (devinfo->gen == 6) {
1272                   /* gen6 math instructions must have the destination be
1273                    * VGRF, so no compute-to-MRF for them.
1274                    */
1275                   if (scan_inst->is_math()) {
1276                      break;
1277                   }
1278                }
1279             }
1280 
1281             /* This doesn't handle saturation on the instruction we
1282              * want to coalesce away if the register types do not match.
1283              * But if scan_inst is a non type-converting 'mov', we can fix
1284              * the types later.
1285              */
1286             if (inst->saturate &&
1287                 inst->dst.type != scan_inst->dst.type &&
1288                 !(scan_inst->opcode == BRW_OPCODE_MOV &&
1289                   scan_inst->dst.type == scan_inst->src[0].type))
1290                break;
1291 
1292             /* Only allow coalescing between registers of the same type size.
1293              * Otherwise we would need to make the pass aware of the fact that
1294              * channel sizes are different for single and double precision.
1295              */
1296             if (type_sz(inst->src[0].type) != type_sz(scan_inst->src[0].type))
1297                break;
1298 
1299             /* Check that scan_inst writes the same amount of data as the
1300              * instruction, otherwise coalescing would lead to writing a
1301              * different (larger or smaller) region of the destination
1302              */
1303             if (scan_inst->size_written != inst->size_written)
1304                break;
1305 
1306             /* If we can't handle the swizzle, bail. */
1307             if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
1308                                           inst->src[0].swizzle,
1309                                           chans_needed)) {
1310                break;
1311             }
1312 
1313             /* This only handles coalescing writes of 8 channels (1 register
1314              * for single-precision and 2 registers for double-precision)
1315              * starting at the source offset of the copy instruction.
1316              */
1317             if (DIV_ROUND_UP(scan_inst->size_written,
1318                              type_sz(scan_inst->dst.type)) > 8 ||
1319                 scan_inst->dst.offset != inst->src[0].offset)
1320                break;
1321 
1322 	    /* Mark which channels we found unconditional writes for. */
1323 	    if (!scan_inst->predicate)
1324                chans_remaining &= ~scan_inst->dst.writemask;
1325 
1326 	    if (chans_remaining == 0)
1327 	       break;
1328 	 }
1329 
1330          /* You can't read from an MRF, so if someone else reads our MRF's
1331           * source GRF that we wanted to rewrite, that stops us.  If it's a
1332           * GRF we're trying to coalesce to, we don't actually handle
1333           * rewriting sources so bail in that case as well.
1334           */
1335 	 bool interfered = false;
1336 	 for (int i = 0; i < 3; i++) {
1337             if (regions_overlap(inst->src[0], inst->size_read(0),
1338                                 scan_inst->src[i], scan_inst->size_read(i)))
1339 	       interfered = true;
1340 	 }
1341 	 if (interfered)
1342 	    break;
1343 
1344          /* If somebody else writes the same channels of our destination here,
1345           * we can't coalesce before that.
1346           */
1347          if (regions_overlap(inst->dst, inst->size_written,
1348                              scan_inst->dst, scan_inst->size_written) &&
1349              (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
1350             break;
1351          }
1352 
1353          /* Check for reads of the register we're trying to coalesce into.  We
1354           * can't go rewriting instructions above that to put some other value
1355           * in the register instead.
1356           */
1357          if (to_mrf && scan_inst->mlen > 0) {
1358             if (inst->dst.nr >= scan_inst->base_mrf &&
1359                 inst->dst.nr < scan_inst->base_mrf + scan_inst->mlen) {
1360                break;
1361             }
1362          } else {
1363             for (int i = 0; i < 3; i++) {
1364                if (regions_overlap(inst->dst, inst->size_written,
1365                                    scan_inst->src[i], scan_inst->size_read(i)))
1366                   interfered = true;
1367             }
1368             if (interfered)
1369                break;
1370          }
1371       }
1372 
1373       if (chans_remaining == 0) {
1374 	 /* If we've made it here, we have an MOV we want to coalesce out, and
1375 	  * a scan_inst pointing to the earliest instruction involved in
1376 	  * computing the value.  Now go rewrite the instruction stream
1377 	  * between the two.
1378 	  */
1379          vec4_instruction *scan_inst = _scan_inst;
1380 	 while (scan_inst != inst) {
1381 	    if (scan_inst->dst.file == VGRF &&
1382                 scan_inst->dst.nr == inst->src[0].nr &&
1383 		scan_inst->dst.offset == inst->src[0].offset) {
1384                scan_inst->reswizzle(inst->dst.writemask,
1385                                     inst->src[0].swizzle);
1386 	       scan_inst->dst.file = inst->dst.file;
1387                scan_inst->dst.nr = inst->dst.nr;
1388 	       scan_inst->dst.offset = inst->dst.offset;
1389                if (inst->saturate &&
1390                    inst->dst.type != scan_inst->dst.type) {
1391                   /* If we have reached this point, scan_inst is a non
1392                    * type-converting 'mov' and we can modify its register types
1393                    * to match the ones in inst. Otherwise, we could have an
1394                    * incorrect saturation result.
1395                    */
1396                   scan_inst->dst.type = inst->dst.type;
1397                   scan_inst->src[0].type = inst->src[0].type;
1398                }
1399 	       scan_inst->saturate |= inst->saturate;
1400 	    }
1401 	    scan_inst = (vec4_instruction *)scan_inst->next;
1402 	 }
1403 	 inst->remove(block);
1404 	 progress = true;
1405       }
1406    }
1407 
1408    if (progress)
1409       invalidate_live_intervals();
1410 
1411    return progress;
1412 }
1413 
1414 /**
1415  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
1416  * flow.  We could probably do better here with some form of divergence
1417  * analysis.
1418  */
1419 bool
eliminate_find_live_channel()1420 vec4_visitor::eliminate_find_live_channel()
1421 {
1422    bool progress = false;
1423    unsigned depth = 0;
1424 
1425    if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
1426       /* The optimization below assumes that channel zero is live on thread
1427        * dispatch, which may not be the case if the fixed function dispatches
1428        * threads sparsely.
1429        */
1430       return false;
1431    }
1432 
1433    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1434       switch (inst->opcode) {
1435       case BRW_OPCODE_IF:
1436       case BRW_OPCODE_DO:
1437          depth++;
1438          break;
1439 
1440       case BRW_OPCODE_ENDIF:
1441       case BRW_OPCODE_WHILE:
1442          depth--;
1443          break;
1444 
1445       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
1446          if (depth == 0) {
1447             inst->opcode = BRW_OPCODE_MOV;
1448             inst->src[0] = brw_imm_d(0);
1449             inst->force_writemask_all = true;
1450             progress = true;
1451          }
1452          break;
1453 
1454       default:
1455          break;
1456       }
1457    }
1458 
1459    return progress;
1460 }
1461 
1462 /**
1463  * Splits virtual GRFs requesting more than one contiguous physical register.
1464  *
1465  * We initially create large virtual GRFs for temporary structures, arrays,
1466  * and matrices, so that the visitor functions can add offsets to work their
1467  * way down to the actual member being accessed.  But when it comes to
1468  * optimization, we'd like to treat each register as individual storage if
1469  * possible.
1470  *
1471  * So far, the only thing that might prevent splitting is a send message from
1472  * a GRF on IVB.
1473  */
1474 void
split_virtual_grfs()1475 vec4_visitor::split_virtual_grfs()
1476 {
1477    int num_vars = this->alloc.count;
1478    int new_virtual_grf[num_vars];
1479    bool split_grf[num_vars];
1480 
1481    memset(new_virtual_grf, 0, sizeof(new_virtual_grf));
1482 
1483    /* Try to split anything > 0 sized. */
1484    for (int i = 0; i < num_vars; i++) {
1485       split_grf[i] = this->alloc.sizes[i] != 1;
1486    }
1487 
1488    /* Check that the instructions are compatible with the registers we're trying
1489     * to split.
1490     */
1491    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1492       if (inst->dst.file == VGRF && regs_written(inst) > 1)
1493          split_grf[inst->dst.nr] = false;
1494 
1495       for (int i = 0; i < 3; i++) {
1496          if (inst->src[i].file == VGRF && regs_read(inst, i) > 1)
1497             split_grf[inst->src[i].nr] = false;
1498       }
1499    }
1500 
1501    /* Allocate new space for split regs.  Note that the virtual
1502     * numbers will be contiguous.
1503     */
1504    for (int i = 0; i < num_vars; i++) {
1505       if (!split_grf[i])
1506          continue;
1507 
1508       new_virtual_grf[i] = alloc.allocate(1);
1509       for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
1510          unsigned reg = alloc.allocate(1);
1511          assert(reg == new_virtual_grf[i] + j - 1);
1512          (void) reg;
1513       }
1514       this->alloc.sizes[i] = 1;
1515    }
1516 
1517    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1518       if (inst->dst.file == VGRF && split_grf[inst->dst.nr] &&
1519           inst->dst.offset / REG_SIZE != 0) {
1520          inst->dst.nr = (new_virtual_grf[inst->dst.nr] +
1521                          inst->dst.offset / REG_SIZE - 1);
1522          inst->dst.offset %= REG_SIZE;
1523       }
1524       for (int i = 0; i < 3; i++) {
1525          if (inst->src[i].file == VGRF && split_grf[inst->src[i].nr] &&
1526              inst->src[i].offset / REG_SIZE != 0) {
1527             inst->src[i].nr = (new_virtual_grf[inst->src[i].nr] +
1528                                 inst->src[i].offset / REG_SIZE - 1);
1529             inst->src[i].offset %= REG_SIZE;
1530          }
1531       }
1532    }
1533    invalidate_live_intervals();
1534 }
1535 
1536 void
dump_instruction(backend_instruction * be_inst)1537 vec4_visitor::dump_instruction(backend_instruction *be_inst)
1538 {
1539    dump_instruction(be_inst, stderr);
1540 }
1541 
1542 void
dump_instruction(backend_instruction * be_inst,FILE * file)1543 vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
1544 {
1545    vec4_instruction *inst = (vec4_instruction *)be_inst;
1546 
1547    if (inst->predicate) {
1548       fprintf(file, "(%cf0.%d%s) ",
1549               inst->predicate_inverse ? '-' : '+',
1550               inst->flag_subreg,
1551               pred_ctrl_align16[inst->predicate]);
1552    }
1553 
1554    fprintf(file, "%s(%d)", brw_instruction_name(devinfo, inst->opcode),
1555            inst->exec_size);
1556    if (inst->saturate)
1557       fprintf(file, ".sat");
1558    if (inst->conditional_mod) {
1559       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
1560       if (!inst->predicate &&
1561           (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
1562                                 inst->opcode != BRW_OPCODE_IF &&
1563                                 inst->opcode != BRW_OPCODE_WHILE))) {
1564          fprintf(file, ".f0.%d", inst->flag_subreg);
1565       }
1566    }
1567    fprintf(file, " ");
1568 
1569    switch (inst->dst.file) {
1570    case VGRF:
1571       fprintf(file, "vgrf%d", inst->dst.nr);
1572       break;
1573    case FIXED_GRF:
1574       fprintf(file, "g%d", inst->dst.nr);
1575       break;
1576    case MRF:
1577       fprintf(file, "m%d", inst->dst.nr);
1578       break;
1579    case ARF:
1580       switch (inst->dst.nr) {
1581       case BRW_ARF_NULL:
1582          fprintf(file, "null");
1583          break;
1584       case BRW_ARF_ADDRESS:
1585          fprintf(file, "a0.%d", inst->dst.subnr);
1586          break;
1587       case BRW_ARF_ACCUMULATOR:
1588          fprintf(file, "acc%d", inst->dst.subnr);
1589          break;
1590       case BRW_ARF_FLAG:
1591          fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1592          break;
1593       default:
1594          fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1595          break;
1596       }
1597       break;
1598    case BAD_FILE:
1599       fprintf(file, "(null)");
1600       break;
1601    case IMM:
1602    case ATTR:
1603    case UNIFORM:
1604       unreachable("not reached");
1605    }
1606    if (inst->dst.offset ||
1607        (inst->dst.file == VGRF &&
1608         alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
1609       const unsigned reg_size = (inst->dst.file == UNIFORM ? 16 : REG_SIZE);
1610       fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
1611               inst->dst.offset % reg_size);
1612    }
1613    if (inst->dst.writemask != WRITEMASK_XYZW) {
1614       fprintf(file, ".");
1615       if (inst->dst.writemask & 1)
1616          fprintf(file, "x");
1617       if (inst->dst.writemask & 2)
1618          fprintf(file, "y");
1619       if (inst->dst.writemask & 4)
1620          fprintf(file, "z");
1621       if (inst->dst.writemask & 8)
1622          fprintf(file, "w");
1623    }
1624    fprintf(file, ":%s", brw_reg_type_to_letters(inst->dst.type));
1625 
1626    if (inst->src[0].file != BAD_FILE)
1627       fprintf(file, ", ");
1628 
1629    for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
1630       if (inst->src[i].negate)
1631          fprintf(file, "-");
1632       if (inst->src[i].abs)
1633          fprintf(file, "|");
1634       switch (inst->src[i].file) {
1635       case VGRF:
1636          fprintf(file, "vgrf%d", inst->src[i].nr);
1637          break;
1638       case FIXED_GRF:
1639          fprintf(file, "g%d.%d", inst->src[i].nr, inst->src[i].subnr);
1640          break;
1641       case ATTR:
1642          fprintf(file, "attr%d", inst->src[i].nr);
1643          break;
1644       case UNIFORM:
1645          fprintf(file, "u%d", inst->src[i].nr);
1646          break;
1647       case IMM:
1648          switch (inst->src[i].type) {
1649          case BRW_REGISTER_TYPE_F:
1650             fprintf(file, "%fF", inst->src[i].f);
1651             break;
1652          case BRW_REGISTER_TYPE_DF:
1653             fprintf(file, "%fDF", inst->src[i].df);
1654             break;
1655          case BRW_REGISTER_TYPE_D:
1656             fprintf(file, "%dD", inst->src[i].d);
1657             break;
1658          case BRW_REGISTER_TYPE_UD:
1659             fprintf(file, "%uU", inst->src[i].ud);
1660             break;
1661          case BRW_REGISTER_TYPE_VF:
1662             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
1663                     brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
1664                     brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
1665                     brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
1666                     brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
1667             break;
1668          default:
1669             fprintf(file, "???");
1670             break;
1671          }
1672          break;
1673       case ARF:
1674          switch (inst->src[i].nr) {
1675          case BRW_ARF_NULL:
1676             fprintf(file, "null");
1677             break;
1678          case BRW_ARF_ADDRESS:
1679             fprintf(file, "a0.%d", inst->src[i].subnr);
1680             break;
1681          case BRW_ARF_ACCUMULATOR:
1682             fprintf(file, "acc%d", inst->src[i].subnr);
1683             break;
1684          case BRW_ARF_FLAG:
1685             fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1686             break;
1687          default:
1688             fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1689             break;
1690          }
1691          break;
1692       case BAD_FILE:
1693          fprintf(file, "(null)");
1694          break;
1695       case MRF:
1696          unreachable("not reached");
1697       }
1698 
1699       if (inst->src[i].offset ||
1700           (inst->src[i].file == VGRF &&
1701            alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
1702          const unsigned reg_size = (inst->src[i].file == UNIFORM ? 16 : REG_SIZE);
1703          fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
1704                  inst->src[i].offset % reg_size);
1705       }
1706 
1707       if (inst->src[i].file != IMM) {
1708          static const char *chans[4] = {"x", "y", "z", "w"};
1709          fprintf(file, ".");
1710          for (int c = 0; c < 4; c++) {
1711             fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
1712          }
1713       }
1714 
1715       if (inst->src[i].abs)
1716          fprintf(file, "|");
1717 
1718       if (inst->src[i].file != IMM) {
1719          fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
1720       }
1721 
1722       if (i < 2 && inst->src[i + 1].file != BAD_FILE)
1723          fprintf(file, ", ");
1724    }
1725 
1726    if (inst->force_writemask_all)
1727       fprintf(file, " NoMask");
1728 
1729    if (inst->exec_size != 8)
1730       fprintf(file, " group%d", inst->group);
1731 
1732    fprintf(file, "\n");
1733 }
1734 
1735 
1736 int
setup_attributes(int payload_reg)1737 vec4_vs_visitor::setup_attributes(int payload_reg)
1738 {
1739    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1740       for (int i = 0; i < 3; i++) {
1741          if (inst->src[i].file == ATTR) {
1742             assert(inst->src[i].offset % REG_SIZE == 0);
1743             int grf = payload_reg + inst->src[i].nr +
1744                       inst->src[i].offset / REG_SIZE;
1745 
1746             struct brw_reg reg = brw_vec8_grf(grf, 0);
1747             reg.swizzle = inst->src[i].swizzle;
1748             reg.type = inst->src[i].type;
1749             reg.abs = inst->src[i].abs;
1750             reg.negate = inst->src[i].negate;
1751             inst->src[i] = reg;
1752          }
1753       }
1754    }
1755 
1756    return payload_reg + vs_prog_data->nr_attribute_slots;
1757 }
1758 
1759 int
setup_uniforms(int reg)1760 vec4_visitor::setup_uniforms(int reg)
1761 {
1762    prog_data->base.dispatch_grf_start_reg = reg;
1763 
1764    /* The pre-gen6 VS requires that some push constants get loaded no
1765     * matter what, or the GPU would hang.
1766     */
1767    if (devinfo->gen < 6 && this->uniforms == 0) {
1768       brw_stage_prog_data_add_params(stage_prog_data, 4);
1769       for (unsigned int i = 0; i < 4; i++) {
1770 	 unsigned int slot = this->uniforms * 4 + i;
1771 	 stage_prog_data->param[slot] = BRW_PARAM_BUILTIN_ZERO;
1772       }
1773 
1774       this->uniforms++;
1775       reg++;
1776    } else {
1777       reg += ALIGN(uniforms, 2) / 2;
1778    }
1779 
1780    for (int i = 0; i < 4; i++)
1781       reg += stage_prog_data->ubo_ranges[i].length;
1782 
1783    stage_prog_data->nr_params = this->uniforms * 4;
1784 
1785    prog_data->base.curb_read_length =
1786       reg - prog_data->base.dispatch_grf_start_reg;
1787 
1788    return reg;
1789 }
1790 
1791 void
setup_payload(void)1792 vec4_vs_visitor::setup_payload(void)
1793 {
1794    int reg = 0;
1795 
1796    /* The payload always contains important data in g0, which contains
1797     * the URB handles that are passed on to the URB write at the end
1798     * of the thread.  So, we always start push constants at g1.
1799     */
1800    reg++;
1801 
1802    reg = setup_uniforms(reg);
1803 
1804    reg = setup_attributes(reg);
1805 
1806    this->first_non_payload_grf = reg;
1807 }
1808 
1809 bool
lower_minmax()1810 vec4_visitor::lower_minmax()
1811 {
1812    assert(devinfo->gen < 6);
1813 
1814    bool progress = false;
1815 
1816    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1817       const vec4_builder ibld(this, block, inst);
1818 
1819       if (inst->opcode == BRW_OPCODE_SEL &&
1820           inst->predicate == BRW_PREDICATE_NONE) {
1821          /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
1822           *        the original SEL.L/GE instruction
1823           */
1824          ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
1825                   inst->conditional_mod);
1826          inst->predicate = BRW_PREDICATE_NORMAL;
1827          inst->conditional_mod = BRW_CONDITIONAL_NONE;
1828 
1829          progress = true;
1830       }
1831    }
1832 
1833    if (progress)
1834       invalidate_live_intervals();
1835 
1836    return progress;
1837 }
1838 
1839 src_reg
get_timestamp()1840 vec4_visitor::get_timestamp()
1841 {
1842    assert(devinfo->gen >= 7);
1843 
1844    src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1845                                 BRW_ARF_TIMESTAMP,
1846                                 0,
1847                                 0,
1848                                 0,
1849                                 BRW_REGISTER_TYPE_UD,
1850                                 BRW_VERTICAL_STRIDE_0,
1851                                 BRW_WIDTH_4,
1852                                 BRW_HORIZONTAL_STRIDE_4,
1853                                 BRW_SWIZZLE_XYZW,
1854                                 WRITEMASK_XYZW));
1855 
1856    dst_reg dst = dst_reg(this, glsl_type::uvec4_type);
1857 
1858    vec4_instruction *mov = emit(MOV(dst, ts));
1859    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
1860     * even if it's not enabled in the dispatch.
1861     */
1862    mov->force_writemask_all = true;
1863 
1864    return src_reg(dst);
1865 }
1866 
/**
 * Record the timestamp at the start of the shader.
 *
 * Sets the current annotation to "shader time start" and stores the register
 * returned by get_timestamp() in shader_start_time, which
 * emit_shader_time_end() later reads to compute the elapsed time.
 */
void
vec4_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}
1873 
1874 void
emit_shader_time_end()1875 vec4_visitor::emit_shader_time_end()
1876 {
1877    current_annotation = "shader time end";
1878    src_reg shader_end_time = get_timestamp();
1879 
1880 
1881    /* Check that there weren't any timestamp reset events (assuming these
1882     * were the only two timestamp reads that happened).
1883     */
1884    src_reg reset_end = shader_end_time;
1885    reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
1886    vec4_instruction *test = emit(AND(dst_null_ud(), reset_end, brw_imm_ud(1u)));
1887    test->conditional_mod = BRW_CONDITIONAL_Z;
1888 
1889    emit(IF(BRW_PREDICATE_NORMAL));
1890 
1891    /* Take the current timestamp and get the delta. */
1892    shader_start_time.negate = true;
1893    dst_reg diff = dst_reg(this, glsl_type::uint_type);
1894    emit(ADD(diff, shader_start_time, shader_end_time));
1895 
1896    /* If there were no instructions between the two timestamp gets, the diff
1897     * is 2 cycles.  Remove that overhead, so I can forget about that when
1898     * trying to determine the time taken for single instructions.
1899     */
1900    emit(ADD(diff, src_reg(diff), brw_imm_ud(-2u)));
1901 
1902    emit_shader_time_write(0, src_reg(diff));
1903    emit_shader_time_write(1, brw_imm_ud(1u));
1904    emit(BRW_OPCODE_ELSE);
1905    emit_shader_time_write(2, brw_imm_ud(1u));
1906    emit(BRW_OPCODE_ENDIF);
1907 }
1908 
1909 void
emit_shader_time_write(int shader_time_subindex,src_reg value)1910 vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
1911 {
1912    dst_reg dst =
1913       dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
1914 
1915    dst_reg offset = dst;
1916    dst_reg time = dst;
1917    time.offset += REG_SIZE;
1918 
1919    offset.type = BRW_REGISTER_TYPE_UD;
1920    int index = shader_time_index * 3 + shader_time_subindex;
1921    emit(MOV(offset, brw_imm_d(index * BRW_SHADER_TIME_STRIDE)));
1922 
1923    time.type = BRW_REGISTER_TYPE_UD;
1924    emit(MOV(time, value));
1925 
1926    vec4_instruction *inst =
1927       emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
1928    inst->mlen = 2;
1929 }
1930 
1931 static bool
is_align1_df(vec4_instruction * inst)1932 is_align1_df(vec4_instruction *inst)
1933 {
1934    switch (inst->opcode) {
1935    case VEC4_OPCODE_DOUBLE_TO_F32:
1936    case VEC4_OPCODE_DOUBLE_TO_D32:
1937    case VEC4_OPCODE_DOUBLE_TO_U32:
1938    case VEC4_OPCODE_TO_DOUBLE:
1939    case VEC4_OPCODE_PICK_LOW_32BIT:
1940    case VEC4_OPCODE_PICK_HIGH_32BIT:
1941    case VEC4_OPCODE_SET_LOW_32BIT:
1942    case VEC4_OPCODE_SET_HIGH_32BIT:
1943       return true;
1944    default:
1945       return false;
1946    }
1947 }
1948 
1949 /**
1950  * Three source instruction must have a GRF/MRF destination register.
1951  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
1952  */
1953 void
fixup_3src_null_dest()1954 vec4_visitor::fixup_3src_null_dest()
1955 {
1956    bool progress = false;
1957 
1958    foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
1959       if (inst->is_3src(devinfo) && inst->dst.is_null()) {
1960          const unsigned size_written = type_sz(inst->dst.type);
1961          const unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE);
1962 
1963          inst->dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)),
1964                             inst->dst.type);
1965          progress = true;
1966       }
1967    }
1968 
1969    if (progress)
1970       invalidate_live_intervals();
1971 }
1972 
/**
 * Replace the IR source and destination registers of every instruction with
 * their hardware (struct brw_reg) form.
 *
 * For each instruction this:
 *  1. converts each of the three sources according to its register file and
 *     applies the logical-to-hardware swizzle translation,
 *  2. for three-source instructions, folds scalar swizzles into the
 *     sub-register number,
 *  3. converts the destination according to its register file.
 */
void
vec4_visitor::convert_to_hw_regs()
{
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         class src_reg &src = inst->src[i];
         struct brw_reg reg;
         switch (src.file) {
         case VGRF: {
            /* The register number is used directly as the GRF number
             * (NOTE(review): presumably register allocation has already
             * rewritten it - confirm), displaced by the operand's byte
             * offset.
             */
            reg = byte_offset(brw_vecn_grf(4, src.nr, 0), src.offset);
            reg.type = src.type;
            reg.abs = src.abs;
            reg.negate = src.negate;
            break;
         }

         case UNIFORM: {
            /* Uniforms live after dispatch_grf_start_reg, two per GRF: the
             * register is nr / 2 and the sub-register is 0 or 4 depending on
             * nr % 2.  The resulting region is <0, 4, 1>.
             */
            reg = stride(byte_offset(brw_vec4_grf(
                                        prog_data->base.dispatch_grf_start_reg +
                                        src.nr / 2, src.nr % 2 * 4),
                                     src.offset),
                         0, 4, 1);
            reg.type = src.type;
            reg.abs = src.abs;
            reg.negate = src.negate;

            /* This should have been moved to pull constants. */
            assert(!src.reladdr);
            break;
         }

         case FIXED_GRF:
            /* Only 64-bit fixed GRF sources go through the swizzle
             * translation below; other fixed GRFs are left untouched.
             */
            if (type_sz(src.type) == 8) {
               reg = src.as_brw_reg();
               break;
            }
            /* fallthrough */
         case ARF:
         case IMM:
            /* Nothing to convert: skip to the next source. */
            continue;

         case BAD_FILE:
            /* Probably unused. */
            reg = brw_null_reg();
            reg = retype(reg, src.type);
            break;

         case MRF:
         case ATTR:
            /* Not expected as a source file at this point. */
            unreachable("not reached");
         }

         apply_logical_swizzle(&reg, inst, i);
         src = reg;

         /* From IVB PRM, vol4, part3, "General Restrictions on Regioning
          * Parameters":
          *
          *   "If ExecSize = Width and HorzStride ≠ 0, VertStride must be set
          *    to Width * HorzStride."
          *
          * We can break this rule with DF sources on DF align1
          * instructions, because the exec_size would be 4 and width is 4.
          * As we know we are not accessing to next GRF, it is safe to
          * set vstride to the formula given by the rule itself.
          */
         if (is_align1_df(inst) && (cvt(inst->exec_size) - 1) == src.width)
            src.vstride = src.width + src.hstride;
      }

      if (inst->is_3src(devinfo)) {
         /* 3-src instructions with scalar sources support arbitrary subnr,
          * but don't actually use swizzles.  Convert swizzle into subnr.
          * Skip this for double-precision instructions: RepCtrl=1 is not
          * allowed for them and needs special handling.
          */
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0 &&
                type_sz(inst->src[i].type) < 8) {
               assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
               /* 4 bytes per swizzle component. */
               inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0);
            }
         }
      }

      dst_reg &dst = inst->dst;
      struct brw_reg reg;

      switch (inst->dst.file) {
      case VGRF:
         reg = byte_offset(brw_vec8_grf(dst.nr, 0), dst.offset);
         reg.type = dst.type;
         reg.writemask = dst.writemask;
         break;

      case MRF:
         reg = byte_offset(brw_message_reg(dst.nr), dst.offset);
         /* The MRF number, ignoring the COMPR4 flag, must exist on this
          * hardware generation.
          */
         assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
         reg.type = dst.type;
         reg.writemask = dst.writemask;
         break;

      case ARF:
      case FIXED_GRF:
         /* Already a hardware register. */
         reg = dst.as_brw_reg();
         break;

      case BAD_FILE:
         /* No destination: write to the null register of the same type. */
         reg = brw_null_reg();
         reg = retype(reg, dst.type);
         break;

      case IMM:
      case ATTR:
      case UNIFORM:
         /* These files are never valid destinations. */
         unreachable("not reached");
      }

      dst = reg;
   }
}
2094 
2095 static bool
stage_uses_interleaved_attributes(unsigned stage,enum shader_dispatch_mode dispatch_mode)2096 stage_uses_interleaved_attributes(unsigned stage,
2097                                   enum shader_dispatch_mode dispatch_mode)
2098 {
2099    switch (stage) {
2100    case MESA_SHADER_TESS_EVAL:
2101       return true;
2102    case MESA_SHADER_GEOMETRY:
2103       return dispatch_mode != DISPATCH_MODE_4X2_DUAL_OBJECT;
2104    default:
2105       return false;
2106    }
2107 }
2108 
2109 /**
2110  * Get the closest native SIMD width supported by the hardware for instruction
2111  * \p inst.  The instruction will be left untouched by
2112  * vec4_visitor::lower_simd_width() if the returned value matches the
2113  * instruction's original execution size.
2114  */
2115 static unsigned
get_lowered_simd_width(const struct gen_device_info * devinfo,enum shader_dispatch_mode dispatch_mode,unsigned stage,const vec4_instruction * inst)2116 get_lowered_simd_width(const struct gen_device_info *devinfo,
2117                        enum shader_dispatch_mode dispatch_mode,
2118                        unsigned stage, const vec4_instruction *inst)
2119 {
2120    /* Do not split some instructions that require special handling */
2121    switch (inst->opcode) {
2122    case SHADER_OPCODE_GEN4_SCRATCH_READ:
2123    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
2124       return inst->exec_size;
2125    default:
2126       break;
2127    }
2128 
2129    unsigned lowered_width = MIN2(16, inst->exec_size);
2130 
2131    /* We need to split some cases of double-precision instructions that write
2132     * 2 registers. We only need to care about this in gen7 because that is the
2133     * only hardware that implements fp64 in Align16.
2134     */
2135    if (devinfo->gen == 7 && inst->size_written > REG_SIZE) {
2136       /* Align16 8-wide double-precision SEL does not work well. Verified
2137        * empirically.
2138        */
2139       if (inst->opcode == BRW_OPCODE_SEL && type_sz(inst->dst.type) == 8)
2140          lowered_width = MIN2(lowered_width, 4);
2141 
2142       /* HSW PRM, 3D Media GPGPU Engine, Region Alignment Rules for Direct
2143        * Register Addressing:
2144        *
2145        *    "When destination spans two registers, the source MUST span two
2146        *     registers."
2147        */
2148       for (unsigned i = 0; i < 3; i++) {
2149          if (inst->src[i].file == BAD_FILE)
2150             continue;
2151          if (inst->size_read(i) <= REG_SIZE)
2152             lowered_width = MIN2(lowered_width, 4);
2153 
2154          /* Interleaved attribute setups use a vertical stride of 0, which
2155           * makes them hit the associated instruction decompression bug in gen7.
2156           * Split them to prevent this.
2157           */
2158          if (inst->src[i].file == ATTR &&
2159              stage_uses_interleaved_attributes(stage, dispatch_mode))
2160             lowered_width = MIN2(lowered_width, 4);
2161       }
2162    }
2163 
2164    /* IvyBridge can manage a maximum of 4 DFs per SIMD4x2 instruction, since
2165     * it doesn't support compression in Align16 mode, no matter if it has
2166     * force_writemask_all enabled or disabled (the latter is affected by the
2167     * compressed instruction bug in gen7, which is another reason to enforce
2168     * this limit).
2169     */
2170    if (devinfo->gen == 7 && !devinfo->is_haswell &&
2171        (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8))
2172       lowered_width = MIN2(lowered_width, 4);
2173 
2174    return lowered_width;
2175 }
2176 
2177 static bool
dst_src_regions_overlap(vec4_instruction * inst)2178 dst_src_regions_overlap(vec4_instruction *inst)
2179 {
2180    if (inst->size_written == 0)
2181       return false;
2182 
2183    unsigned dst_start = inst->dst.offset;
2184    unsigned dst_end = dst_start + inst->size_written - 1;
2185    for (int i = 0; i < 3; i++) {
2186       if (inst->src[i].file == BAD_FILE)
2187          continue;
2188 
2189       if (inst->dst.file != inst->src[i].file ||
2190           inst->dst.nr != inst->src[i].nr)
2191          continue;
2192 
2193       unsigned src_start = inst->src[i].offset;
2194       unsigned src_end = src_start + inst->size_read(i) - 1;
2195 
2196       if ((dst_start >= src_start && dst_start <= src_end) ||
2197           (dst_end >= src_start && dst_end <= src_end) ||
2198           (dst_start <= src_start && dst_end >= src_end)) {
2199          return true;
2200       }
2201    }
2202 
2203    return false;
2204 }
2205 
/**
 * Split every instruction whose execution size exceeds what
 * get_lowered_simd_width() reports into several narrower instructions.
 *
 * Each split instruction is a copy of the original covering lowered_width
 * channels, inserted before the original, which is removed afterwards.
 * When the original's destination overlaps one of its sources, the split
 * instructions write to temporaries that are then copied to the real
 * destination.
 *
 * \return true if any instruction was split.
 */
bool
vec4_visitor::lower_simd_width()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      const unsigned lowered_width =
         get_lowered_simd_width(devinfo, prog_data->dispatch_mode, stage, inst);
      assert(lowered_width <= inst->exec_size);
      /* Already natively supported: nothing to split. */
      if (lowered_width == inst->exec_size)
         continue;

      /* We need to deal with source / destination overlaps when splitting.
       * The hardware supports reading from and writing to the same register
       * in the same instruction, but we need to be careful that each split
       * instruction we produce does not corrupt the source of the next.
       *
       * The easiest way to handle this is to make the split instructions write
       * to temporaries if there is an src/dst overlap and then move from the
       * temporaries to the original destination. We also need to consider
       * instructions that do partial writes via align1 opcodes, in which case
       * we need to make sure that we initialize the temporary with the
       * value of the instruction's dst.
       */
      bool needs_temp = dst_src_regions_overlap(inst);
      /* One iteration per lowered_width-wide group of channels. */
      for (unsigned n = 0; n < inst->exec_size / lowered_width; n++)  {
         unsigned channel_offset = lowered_width * n;

         unsigned size_written = lowered_width * type_sz(inst->dst.type);

         /* Create the split instruction from the original so that we copy all
          * relevant instruction fields, then set the width and calculate the
          * new dst/src regions.
          */
         vec4_instruction *linst = new(mem_ctx) vec4_instruction(*inst);
         linst->exec_size = lowered_width;
         linst->group = channel_offset;
         linst->size_written = size_written;

         /* Compute split dst region */
         dst_reg dst;
         if (needs_temp) {
            unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE);
            dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)),
                         inst->dst.type);
            if (inst->is_align1_partial_write()) {
               /* Seed the temporary with the current destination contents so
                * the channels the partial write leaves alone are preserved.
                */
               vec4_instruction *copy = MOV(dst, src_reg(inst->dst));
               copy->exec_size = lowered_width;
               copy->group = channel_offset;
               copy->size_written = size_written;
               inst->insert_before(block, copy);
            }
         } else {
            dst = horiz_offset(inst->dst, channel_offset);
         }
         linst->dst = dst;

         /* Compute split source regions */
         for (int i = 0; i < 3; i++) {
            if (linst->src[i].file == BAD_FILE)
               continue;

            bool is_interleaved_attr =
               linst->src[i].file == ATTR &&
               stage_uses_interleaved_attributes(stage,
                                                 prog_data->dispatch_mode);

            /* Uniform sources and interleaved attributes keep the same
             * region for every split instruction; all others advance by
             * the channel offset.
             */
            if (!is_uniform(linst->src[i]) && !is_interleaved_attr)
               linst->src[i] = horiz_offset(linst->src[i], channel_offset);
         }

         inst->insert_before(block, linst);

         /* If we used a temporary to store the result of the split
          * instruction, copy the result to the original destination
          */
         if (needs_temp) {
            vec4_instruction *mov =
               MOV(offset(inst->dst, lowered_width, n), src_reg(dst));
            mov->exec_size = lowered_width;
            mov->group = channel_offset;
            mov->size_written = size_written;
            /* The copy carries the original instruction's predicate. */
            mov->predicate = inst->predicate;
            inst->insert_before(block, mov);
         }
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
2302 
2303 static brw_predicate
scalarize_predicate(brw_predicate predicate,unsigned writemask)2304 scalarize_predicate(brw_predicate predicate, unsigned writemask)
2305 {
2306    if (predicate != BRW_PREDICATE_NORMAL)
2307       return predicate;
2308 
2309    switch (writemask) {
2310    case WRITEMASK_X:
2311       return BRW_PREDICATE_ALIGN16_REPLICATE_X;
2312    case WRITEMASK_Y:
2313       return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
2314    case WRITEMASK_Z:
2315       return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
2316    case WRITEMASK_W:
2317       return BRW_PREDICATE_ALIGN16_REPLICATE_W;
2318    default:
2319       unreachable("invalid writemask");
2320    }
2321 }
2322 
2323 /* Gen7 has a hardware decompression bug that we can exploit to represent
2324  * handful of additional swizzles natively.
2325  */
2326 static bool
is_gen7_supported_64bit_swizzle(vec4_instruction * inst,unsigned arg)2327 is_gen7_supported_64bit_swizzle(vec4_instruction *inst, unsigned arg)
2328 {
2329    switch (inst->src[arg].swizzle) {
2330    case BRW_SWIZZLE_XXXX:
2331    case BRW_SWIZZLE_YYYY:
2332    case BRW_SWIZZLE_ZZZZ:
2333    case BRW_SWIZZLE_WWWW:
2334    case BRW_SWIZZLE_XYXY:
2335    case BRW_SWIZZLE_YXYX:
2336    case BRW_SWIZZLE_ZWZW:
2337    case BRW_SWIZZLE_WZWZ:
2338       return true;
2339    default:
2340       return false;
2341    }
2342 }
2343 
2344 /* 64-bit sources use regions with a width of 2. These 2 elements in each row
2345  * can be addressed using 32-bit swizzles (which is what the hardware supports)
2346  * but it also means that the swizzle we apply on the first two components of a
2347  * dvec4 is coupled with the swizzle we use for the last 2. In other words,
2348  * only some specific swizzle combinations can be natively supported.
2349  *
2350  * FIXME: we can go an step further and implement even more swizzle
2351  *        variations using only partial scalarization.
2352  *
2353  * For more details see:
2354  * https://bugs.freedesktop.org/show_bug.cgi?id=92760#c82
2355  */
2356 bool
is_supported_64bit_region(vec4_instruction * inst,unsigned arg)2357 vec4_visitor::is_supported_64bit_region(vec4_instruction *inst, unsigned arg)
2358 {
2359    const src_reg &src = inst->src[arg];
2360    assert(type_sz(src.type) == 8);
2361 
2362    /* Uniform regions have a vstride=0. Because we use 2-wide rows with
2363     * 64-bit regions it means that we cannot access components Z/W, so
2364     * return false for any such case. Interleaved attributes will also be
2365     * mapped to GRF registers with a vstride of 0, so apply the same
2366     * treatment.
2367     */
2368    if ((is_uniform(src) ||
2369         (stage_uses_interleaved_attributes(stage, prog_data->dispatch_mode) &&
2370          src.file == ATTR)) &&
2371        (brw_mask_for_swizzle(src.swizzle) & 12))
2372       return false;
2373 
2374    switch (src.swizzle) {
2375    case BRW_SWIZZLE_XYZW:
2376    case BRW_SWIZZLE_XXZZ:
2377    case BRW_SWIZZLE_YYWW:
2378    case BRW_SWIZZLE_YXWZ:
2379       return true;
2380    default:
2381       return devinfo->gen == 7 && is_gen7_supported_64bit_swizzle(inst, arg);
2382    }
2383 }
2384 
2385 bool
scalarize_df()2386 vec4_visitor::scalarize_df()
2387 {
2388    bool progress = false;
2389 
2390    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
2391       /* Skip DF instructions that operate in Align1 mode */
2392       if (is_align1_df(inst))
2393          continue;
2394 
2395       /* Check if this is a double-precision instruction */
2396       bool is_double = type_sz(inst->dst.type) == 8;
2397       for (int arg = 0; !is_double && arg < 3; arg++) {
2398          is_double = inst->src[arg].file != BAD_FILE &&
2399                      type_sz(inst->src[arg].type) == 8;
2400       }
2401 
2402       if (!is_double)
2403          continue;
2404 
2405       /* Skip the lowering for specific regioning scenarios that we can
2406        * support natively.
2407        */
2408       bool skip_lowering = true;
2409 
2410       /* XY and ZW writemasks operate in 32-bit, which means that they don't
2411        * have a native 64-bit representation and they should always be split.
2412        */
2413       if (inst->dst.writemask == WRITEMASK_XY ||
2414           inst->dst.writemask == WRITEMASK_ZW) {
2415          skip_lowering = false;
2416       } else {
2417          for (unsigned i = 0; i < 3; i++) {
2418             if (inst->src[i].file == BAD_FILE || type_sz(inst->src[i].type) < 8)
2419                continue;
2420             skip_lowering = skip_lowering && is_supported_64bit_region(inst, i);
2421          }
2422       }
2423 
2424       if (skip_lowering)
2425          continue;
2426 
2427       /* Generate scalar instructions for each enabled channel */
2428       for (unsigned chan = 0; chan < 4; chan++) {
2429          unsigned chan_mask = 1 << chan;
2430          if (!(inst->dst.writemask & chan_mask))
2431             continue;
2432 
2433          vec4_instruction *scalar_inst = new(mem_ctx) vec4_instruction(*inst);
2434 
2435          for (unsigned i = 0; i < 3; i++) {
2436             unsigned swz = BRW_GET_SWZ(inst->src[i].swizzle, chan);
2437             scalar_inst->src[i].swizzle = BRW_SWIZZLE4(swz, swz, swz, swz);
2438          }
2439 
2440          scalar_inst->dst.writemask = chan_mask;
2441 
2442          if (inst->predicate != BRW_PREDICATE_NONE) {
2443             scalar_inst->predicate =
2444                scalarize_predicate(inst->predicate, chan_mask);
2445          }
2446 
2447          inst->insert_before(block, scalar_inst);
2448       }
2449 
2450       inst->remove(block);
2451       progress = true;
2452    }
2453 
2454    if (progress)
2455       invalidate_live_intervals();
2456 
2457    return progress;
2458 }
2459 
2460 bool
lower_64bit_mad_to_mul_add()2461 vec4_visitor::lower_64bit_mad_to_mul_add()
2462 {
2463    bool progress = false;
2464 
2465    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
2466       if (inst->opcode != BRW_OPCODE_MAD)
2467          continue;
2468 
2469       if (type_sz(inst->dst.type) != 8)
2470          continue;
2471 
2472       dst_reg mul_dst = dst_reg(this, glsl_type::dvec4_type);
2473 
2474       /* Use the copy constructor so we copy all relevant instruction fields
2475        * from the original mad into the add and mul instructions
2476        */
2477       vec4_instruction *mul = new(mem_ctx) vec4_instruction(*inst);
2478       mul->opcode = BRW_OPCODE_MUL;
2479       mul->dst = mul_dst;
2480       mul->src[0] = inst->src[1];
2481       mul->src[1] = inst->src[2];
2482       mul->src[2].file = BAD_FILE;
2483 
2484       vec4_instruction *add = new(mem_ctx) vec4_instruction(*inst);
2485       add->opcode = BRW_OPCODE_ADD;
2486       add->src[0] = src_reg(mul_dst);
2487       add->src[1] = inst->src[0];
2488       add->src[2].file = BAD_FILE;
2489 
2490       inst->insert_before(block, mul);
2491       inst->insert_before(block, add);
2492       inst->remove(block);
2493 
2494       progress = true;
2495    }
2496 
2497    if (progress)
2498       invalidate_live_intervals();
2499 
2500    return progress;
2501 }
2502 
/* The align16 hardware can only do 32-bit swizzle channels, so we need to
 * translate the logical 64-bit swizzle channels that we use in the Vec4 IR
 * to 32-bit swizzle channels in hardware registers.
 *
 * @inst and @arg identify the original vec4 IR source operand we need to
 * translate the swizzle for and @hw_reg is the hardware register where we
 * will write the hardware swizzle to use.
 *
 * Nothing is written for BAD_FILE and immediate sources. Sources narrower
 * than 64 bits, and sources of Align1 DF instructions, get their IR swizzle
 * copied unchanged.
 *
 * This pass assumes that Align16/DF instructions have been fully scalarized
 * previously so there is just one 64-bit swizzle channel to deal with for any
 * given Vec4 IR source.
 */
void
vec4_visitor::apply_logical_swizzle(struct brw_reg *hw_reg,
                                    vec4_instruction *inst, int arg)
{
   src_reg reg = inst->src[arg];

   if (reg.file == BAD_FILE || reg.file == BRW_IMMEDIATE_VALUE)
      return;

   /* If this is not a 64-bit operand or this is an Align1 DF instruction we
    * don't need to do anything about the swizzles.
    */
   if(type_sz(reg.type) < 8 || is_align1_df(inst)) {
      hw_reg->swizzle = reg.swizzle;
      return;
   }

   /* Take the 64-bit logical swizzle channel and translate it to 32-bit */
   assert(brw_is_single_value_swizzle(reg.swizzle) ||
          is_supported_64bit_region(inst, arg));

   /* Apply the region <2, 2, 1> for GRF or <0, 2, 1> for uniforms, as align16
    * HW can only do 32-bit swizzle channels.
    */
   hw_reg->width = BRW_WIDTH_2;

   if (is_supported_64bit_region(inst, arg) &&
       !is_gen7_supported_64bit_swizzle(inst, arg)) {
      /* Supported 64-bit swizzles are those such that their first two
       * components, when expanded to 32-bit swizzles, match the semantics
       * of the original 64-bit swizzle with 2-wide row regioning.
       */
      unsigned swizzle0 = BRW_GET_SWZ(reg.swizzle, 0);
      unsigned swizzle1 = BRW_GET_SWZ(reg.swizzle, 1);
      /* Each 64-bit channel N maps to the 32-bit channel pair 2N, 2N+1. */
      hw_reg->swizzle = BRW_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1,
                                     swizzle1 * 2, swizzle1 * 2 + 1);
   } else {
      /* If we got here then we have one of the following:
       *
       * 1. An unsupported swizzle, which should be single-value thanks to the
       *    scalarization pass.
       *
       * 2. A gen7 supported swizzle. These can be single-value or double-value
       *    swizzles. If the latter, they are never cross-dvec2 channels. For
       *    these we always need to activate the gen7 vstride=0 exploit.
       */
      unsigned swizzle0 = BRW_GET_SWZ(reg.swizzle, 0);
      unsigned swizzle1 = BRW_GET_SWZ(reg.swizzle, 1);
      /* Both components must come from the same half (XY or ZW). */
      assert((swizzle0 < 2) == (swizzle1 < 2));

      /* To gain access to Z/W components we need to select the second half
       * of the register and then use a X/Y swizzle to select Z/W respectively.
       */
      if (swizzle0 >= 2) {
         *hw_reg = suboffset(*hw_reg, 2);
         swizzle0 -= 2;
         swizzle1 -= 2;
      }

      /* All gen7-specific supported swizzles require the vstride=0 exploit */
      if (devinfo->gen == 7 && is_gen7_supported_64bit_swizzle(inst, arg))
         hw_reg->vstride = BRW_VERTICAL_STRIDE_0;

      /* Any 64-bit source with an offset at 16B is intended to address the
       * second half of a register and needs a vertical stride of 0 so we:
       *
       * 1. Don't violate register region restrictions.
       * 2. Activate the gen7 instruction decompression bug exploit when
       *    execsize > 4
       */
      if (hw_reg->subnr % REG_SIZE == 16) {
         assert(devinfo->gen == 7);
         hw_reg->vstride = BRW_VERTICAL_STRIDE_0;
      }

      /* Same 64-bit to 32-bit channel pair expansion as above. */
      hw_reg->swizzle = BRW_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1,
                                     swizzle1 * 2, swizzle1 * 2 + 1);
   }
}
2594 
2595 bool
run()2596 vec4_visitor::run()
2597 {
2598    if (shader_time_index >= 0)
2599       emit_shader_time_begin();
2600 
2601    emit_prolog();
2602 
2603    emit_nir_code();
2604    if (failed)
2605       return false;
2606    base_ir = NULL;
2607 
2608    emit_thread_end();
2609 
2610    calculate_cfg();
2611 
2612    /* Before any optimization, push array accesses out to scratch
2613     * space where we need them to be.  This pass may allocate new
2614     * virtual GRFs, so we want to do it early.  It also makes sure
2615     * that we have reladdr computations available for CSE, since we'll
2616     * often do repeated subexpressions for those.
2617     */
2618    move_grf_array_access_to_scratch();
2619    move_uniform_array_access_to_pull_constants();
2620 
2621    pack_uniform_registers();
2622    move_push_constants_to_pull_constants();
2623    split_virtual_grfs();
2624 
2625 #define OPT(pass, args...) ({                                          \
2626       pass_num++;                                                      \
2627       bool this_progress = pass(args);                                 \
2628                                                                        \
2629       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
2630          char filename[64];                                            \
2631          snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass,              \
2632                   stage_abbrev, nir->info.name, iteration, pass_num); \
2633                                                                        \
2634          backend_shader::dump_instructions(filename);                  \
2635       }                                                                \
2636                                                                        \
2637       progress = progress || this_progress;                            \
2638       this_progress;                                                   \
2639    })
2640 
2641 
2642    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
2643       char filename[64];
2644       snprintf(filename, 64, "%s-%s-00-00-start",
2645                stage_abbrev, nir->info.name);
2646 
2647       backend_shader::dump_instructions(filename);
2648    }
2649 
2650    bool progress;
2651    int iteration = 0;
2652    int pass_num = 0;
2653    do {
2654       progress = false;
2655       pass_num = 0;
2656       iteration++;
2657 
2658       OPT(opt_predicated_break, this);
2659       OPT(opt_reduce_swizzle);
2660       OPT(dead_code_eliminate);
2661       OPT(dead_control_flow_eliminate, this);
2662       OPT(opt_copy_propagation);
2663       OPT(opt_cmod_propagation);
2664       OPT(opt_cse);
2665       OPT(opt_algebraic);
2666       OPT(opt_register_coalesce);
2667       OPT(eliminate_find_live_channel);
2668    } while (progress);
2669 
2670    pass_num = 0;
2671 
2672    if (OPT(opt_vector_float)) {
2673       OPT(opt_cse);
2674       OPT(opt_copy_propagation, false);
2675       OPT(opt_copy_propagation, true);
2676       OPT(dead_code_eliminate);
2677    }
2678 
2679    if (devinfo->gen <= 5 && OPT(lower_minmax)) {
2680       OPT(opt_cmod_propagation);
2681       OPT(opt_cse);
2682       OPT(opt_copy_propagation);
2683       OPT(dead_code_eliminate);
2684    }
2685 
2686    if (OPT(lower_simd_width)) {
2687       OPT(opt_copy_propagation);
2688       OPT(dead_code_eliminate);
2689    }
2690 
2691    if (failed)
2692       return false;
2693 
2694    OPT(lower_64bit_mad_to_mul_add);
2695 
2696    /* Run this before payload setup because tesselation shaders
2697     * rely on it to prevent cross dvec2 regioning on DF attributes
2698     * that are setup so that XY are on the second half of register and
2699     * ZW are in the first half of the next.
2700     */
2701    OPT(scalarize_df);
2702 
2703    setup_payload();
2704 
2705    if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {
2706       /* Debug of register spilling: Go spill everything. */
2707       const int grf_count = alloc.count;
2708       float spill_costs[alloc.count];
2709       bool no_spill[alloc.count];
2710       evaluate_spill_costs(spill_costs, no_spill);
2711       for (int i = 0; i < grf_count; i++) {
2712          if (no_spill[i])
2713             continue;
2714          spill_reg(i);
2715       }
2716 
2717       /* We want to run this after spilling because 64-bit (un)spills need to
2718        * emit code to shuffle 64-bit data for the 32-bit scratch read/write
2719        * messages that can produce unsupported 64-bit swizzle regions.
2720        */
2721       OPT(scalarize_df);
2722    }
2723 
2724    fixup_3src_null_dest();
2725 
2726    bool allocated_without_spills = reg_allocate();
2727 
2728    if (!allocated_without_spills) {
2729       compiler->shader_perf_log(log_data,
2730                                 "%s shader triggered register spilling.  "
2731                                 "Try reducing the number of live vec4 values "
2732                                 "to improve performance.\n",
2733                                 stage_name);
2734 
2735       while (!reg_allocate()) {
2736          if (failed)
2737             return false;
2738       }
2739 
2740       /* We want to run this after spilling because 64-bit (un)spills need to
2741        * emit code to shuffle 64-bit data for the 32-bit scratch read/write
2742        * messages that can produce unsupported 64-bit swizzle regions.
2743        */
2744       OPT(scalarize_df);
2745    }
2746 
2747    opt_schedule_instructions();
2748 
2749    opt_set_dependency_control();
2750 
2751    convert_to_hw_regs();
2752 
2753    if (last_scratch > 0) {
2754       prog_data->base.total_scratch =
2755          brw_get_scratch_size(last_scratch * REG_SIZE);
2756    }
2757 
2758    return !failed;
2759 }
2760 
2761 } /* namespace brw */
2762 
2763 extern "C" {
2764 
2765 /**
2766  * Compile a vertex shader.
2767  *
2768  * Returns the final assembly and the program's size.
2769  */
2770 const unsigned *
brw_compile_vs(const struct brw_compiler * compiler,void * log_data,void * mem_ctx,const struct brw_vs_prog_key * key,struct brw_vs_prog_data * prog_data,const nir_shader * src_shader,int shader_time_index,char ** error_str)2771 brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
2772                void *mem_ctx,
2773                const struct brw_vs_prog_key *key,
2774                struct brw_vs_prog_data *prog_data,
2775                const nir_shader *src_shader,
2776                int shader_time_index,
2777                char **error_str)
2778 {
2779    const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
2780    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
2781    shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, is_scalar);
2782 
2783    const unsigned *assembly = NULL;
2784 
2785    if (prog_data->base.vue_map.varying_to_slot[VARYING_SLOT_EDGE] != -1) {
2786       /* If the output VUE map contains VARYING_SLOT_EDGE then we need to copy
2787        * the edge flag from VERT_ATTRIB_EDGEFLAG.  This will be done
2788        * automatically by brw_vec4_visitor::emit_urb_slot but we need to
2789        * ensure that prog_data->inputs_read is accurate.
2790        *
2791        * In order to make late NIR passes aware of the change, we actually
2792        * whack shader->info.inputs_read instead.  This is safe because we just
2793        * made a copy of the shader.
2794        */
2795       assert(!is_scalar);
2796       assert(key->copy_edgeflag);
2797       shader->info.inputs_read |= VERT_BIT_EDGEFLAG;
2798    }
2799 
2800    prog_data->inputs_read = shader->info.inputs_read;
2801    prog_data->double_inputs_read = shader->info.double_inputs_read;
2802 
2803    brw_nir_lower_vs_inputs(shader, key->gl_attrib_wa_flags);
2804    brw_nir_lower_vue_outputs(shader, is_scalar);
2805    shader = brw_postprocess_nir(shader, compiler, is_scalar);
2806 
2807    prog_data->base.clip_distance_mask =
2808       ((1 << shader->info.clip_distance_array_size) - 1);
2809    prog_data->base.cull_distance_mask =
2810       ((1 << shader->info.cull_distance_array_size) - 1) <<
2811       shader->info.clip_distance_array_size;
2812 
2813    unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data->inputs_read);
2814 
2815    /* gl_VertexID and gl_InstanceID are system values, but arrive via an
2816     * incoming vertex attribute.  So, add an extra slot.
2817     */
2818    if (shader->info.system_values_read &
2819        (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
2820         BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
2821         BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
2822         BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
2823       nr_attribute_slots++;
2824    }
2825 
2826    if (shader->info.system_values_read &
2827        BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX))
2828       prog_data->uses_basevertex = true;
2829 
2830    if (shader->info.system_values_read &
2831        BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE))
2832       prog_data->uses_baseinstance = true;
2833 
2834    if (shader->info.system_values_read &
2835        BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE))
2836       prog_data->uses_vertexid = true;
2837 
2838    if (shader->info.system_values_read &
2839        BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))
2840       prog_data->uses_instanceid = true;
2841 
2842    /* gl_DrawID has its very own vec4 */
2843    if (shader->info.system_values_read &
2844        BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) {
2845       prog_data->uses_drawid = true;
2846       nr_attribute_slots++;
2847    }
2848 
2849    /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
2850     * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
2851     * vec4 mode, the hardware appears to wedge unless we read something.
2852     */
2853    if (is_scalar)
2854       prog_data->base.urb_read_length =
2855          DIV_ROUND_UP(nr_attribute_slots, 2);
2856    else
2857       prog_data->base.urb_read_length =
2858          DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2);
2859 
2860    prog_data->nr_attribute_slots = nr_attribute_slots;
2861 
2862    /* Since vertex shaders reuse the same VUE entry for inputs and outputs
2863     * (overwriting the original contents), we need to make sure the size is
2864     * the larger of the two.
2865     */
2866    const unsigned vue_entries =
2867       MAX2(nr_attribute_slots, (unsigned)prog_data->base.vue_map.num_slots);
2868 
2869    if (compiler->devinfo->gen == 6) {
2870       prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
2871    } else {
2872       prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
2873       /* On Cannonlake software shall not program an allocation size that
2874        * specifies a size that is a multiple of 3 64B (512-bit) cachelines.
2875        */
2876       if (compiler->devinfo->gen == 10 &&
2877           prog_data->base.urb_entry_size % 3 == 0)
2878          prog_data->base.urb_entry_size++;
2879    }
2880 
2881    if (INTEL_DEBUG & DEBUG_VS) {
2882       fprintf(stderr, "VS Output ");
2883       brw_print_vue_map(stderr, &prog_data->base.vue_map);
2884    }
2885 
2886    if (is_scalar) {
2887       prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
2888 
2889       fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base,
2890                    NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */
2891                    shader, 8, shader_time_index);
2892       if (!v.run_vs()) {
2893          if (error_str)
2894             *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
2895 
2896          return NULL;
2897       }
2898 
2899       prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
2900 
2901       fs_generator g(compiler, log_data, mem_ctx, (void *) key,
2902                      &prog_data->base.base, v.promoted_constants,
2903                      v.runtime_check_aads_emit, MESA_SHADER_VERTEX);
2904       if (INTEL_DEBUG & DEBUG_VS) {
2905          const char *debug_name =
2906             ralloc_asprintf(mem_ctx, "%s vertex shader %s",
2907                             shader->info.label ? shader->info.label :
2908                                "unnamed",
2909                             shader->info.name);
2910 
2911          g.enable_debug(debug_name);
2912       }
2913       g.generate_code(v.cfg, 8);
2914       assembly = g.get_assembly(&prog_data->base.base.program_size);
2915    }
2916 
2917    if (!assembly) {
2918       prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
2919 
2920       vec4_vs_visitor v(compiler, log_data, key, prog_data,
2921                         shader, mem_ctx, shader_time_index);
2922       if (!v.run()) {
2923          if (error_str)
2924             *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
2925 
2926          return NULL;
2927       }
2928 
2929       assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
2930                                             shader, &prog_data->base, v.cfg,
2931                                             &prog_data->base.base.program_size);
2932    }
2933 
2934    return assembly;
2935 }
2936 
2937 } /* extern "C" */
2938