/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
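
      /* Usage sketch (illustrative only; "bld", "block", "inst" and "tmp"
       * are assumed to come from the surrounding optimization pass):
       *
       *    const fs_builder ibld = bld.at(block, inst);
       *    ibld.MOV(tmp, inst->src[0]);         // inserted just before "inst"
       *
       *    bld.at_end().emit(BRW_OPCODE_NOP);   // appended to the program
       */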

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
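
      /* Usage sketch (illustrative only; assumes "bld" is a valid SIMD16
       * builder and "dst"/"src" are suitably sized registers):
       *
       *    const fs_builder ubld = bld.group(8, 1);
       *    ubld.MOV(dst, src);   // acts on channels 8..15 of the dispatch
       *
       * Asking for a group outside the parent's channels, e.g.
       * bld.group(16, 1) on a SIMD16 builder, is only legal with
       * force_writemask_all set (see the assertion above).
       */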

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }
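
      /* Usage sketch (illustrative only): writemask-all builders are
       * typically used for bookkeeping moves that must happen regardless of
       * which channels are enabled, e.g. initializing a message header.
       * "header" is an assumed name for a register of suitable size:
       *
       *    const fs_builder ubld = bld.exec_all().group(8, 0);
       *    ubld.MOV(header, brw_imm_ud(0));   // runs even in dead channels
       */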

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
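
      /* Worked example (illustrative only): with a SIMD16 builder and
       * REG_SIZE = 32 bytes, vgrf(BRW_REGISTER_TYPE_F, 4) allocates
       * DIV_ROUND_UP(4 * 4 * 16, 32) = 8 GRFs, i.e. two registers per
       * logical component.  "bld" and "src" are assumed names:
       *
       *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
       *    bld.MOV(offset(tmp, bld.dispatch_width(), 2), src);
       *    // writes the third logical component of tmp
       */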

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 2) {
            return emit(opcode, dst, srcs[0], srcs[1]);
         } else if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as an operand for src1, so use
          * the same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
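
      /* Usage sketch (illustrative only): with BRW_CONDITIONAL_GE this
       * computes a maximum, with BRW_CONDITIONAL_L a minimum:
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);   // dst = max(a, b)
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_L);    // dst = min(a, b)
       */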

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
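
      /* Usage sketch (illustrative only): indirect surface access usually
       * requires a dynamically uniform index, so a potentially divergent
       * value is uniformized first.  "surf_index" is an assumed name:
       *
       *    const src_reg usurf = bld.emit_uniformize(surf_index);
       *    // usurf reads the value of an arbitrary live channel
       */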

      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               src_reg left_comp = component(left, half_width - 1);
               set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            const dst_reg left = horiz_stride(tmp, 2);
            const dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);

               dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               right = horiz_stride(horiz_offset(tmp, 3), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);

               for (unsigned i = 0; i < dispatch_width(); i += 4) {
                  src_reg left = component(tmp, i + 1);
                  dst_reg right = horiz_offset(tmp, i + 2);
                  set_condmod(mod, ubld.emit(opcode, right, left, right));
               }
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            src_reg left = component(tmp, i - 1);
            dst_reg right = horiz_offset(tmp, i);
            set_condmod(mod, ubld.emit(opcode, right, left, right));

            if (dispatch_width() > i * 2) {
               left = component(tmp, i * 3 - 1);
               right = horiz_offset(tmp, i * 3);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }

            if (dispatch_width() > i * 4) {
               left = component(tmp, i * 5 - 1);
               right = horiz_offset(tmp, i * 5);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               left = component(tmp, i * 7 - 1);
               right = horiz_offset(tmp, i * 7);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }
         }
      }
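
      /* Worked example (illustrative only): an inclusive scan over SIMD8
       * with opcode = BRW_OPCODE_ADD and cluster_size = 8 combines the
       * channels of "t" (= tmp) in place in log-style steps:
       *
       *    step 1 (pairs): t1 += t0;  t3 += t2;  t5 += t4;  t7 += t6
       *    step 2 (quads): t2 += t1;  t3 += t1;  t6 += t5;  t7 += t5
       *    step 3 (i = 4): t4 += t3;  t5 += t3;  t6 += t3;  t7 += t3
       *
       * leaving tmp[k] = sum(tmp[0..k]).  All of the combining instructions
       * are emitted with force_writemask_all since they read and write
       * channels across group boundaries.
       */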

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
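
      /* Usage sketch (illustrative only): comparisons whose only consumer
       * is the flag register conventionally use a null destination, and the
       * flag then drives a predicated instruction:
       *
       *    bld.CMP(bld.null_reg_f(), x, y, BRW_CONDITIONAL_GE);
       *    set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst, x, y));
       */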

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(src1, BRW_REGISTER_TYPE_F),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);
         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }
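
      /* Usage sketch (illustrative only): move_to_vgrf() above is the
       * canonical caller.  A two-component payload without a header could be
       * assembled as follows; "s0" and "s1" are assumed sources of the
       * builder's dispatch width:
       *
       *    const src_reg srcs[] = { s0, s1 };
       *    const dst_reg payload = bld.vgrf(s0.type, 2);
       *    bld.LOAD_PAYLOAD(payload, srcs, 2, 0);
       */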

      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         instruction *inst = emit(SHADER_OPCODE_UNDEF,
                                  retype(dst, BRW_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE;

         return inst;
      }

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            /* fallthrough */
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use
          * IMM operands with math instructions.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif