1 /* -*- c++ -*- */
2 /*
3  * Copyright © 2010-2015 Intel Corporation
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 #ifndef BRW_FS_BUILDER_H
26 #define BRW_FS_BUILDER_H
27 
28 #include "brw_ir_fs.h"
29 #include "brw_shader.h"
30 
31 namespace brw {
32    /**
33     * Toolbox to assemble an FS IR program out of individual instructions.
34     *
35     * This object is meant to have an interface consistent with
36     * brw::vec4_builder.  They cannot be fully interchangeable because
37     * brw::fs_builder generates scalar code while brw::vec4_builder generates
38     * vector code.
39     */
40    class fs_builder {
41    public:
      /**
       * Type used in this IR to represent a source of an instruction.
       * Aliases fs_reg, the same type used for destinations.
       */
      typedef fs_reg src_reg;

      /**
       * Type used in this IR to represent the destination of an instruction.
       * Aliases fs_reg, so sources and destinations share one register type.
       */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction (fs_inst). */
      typedef fs_inst instruction;
50 
51       /**
52        * Construct an fs_builder that inserts instructions into \p shader.
53        * \p dispatch_width gives the native execution width of the program.
54        */
fs_builder(backend_shader * shader,unsigned dispatch_width)55       fs_builder(backend_shader *shader,
56                  unsigned dispatch_width) :
57          shader(shader), block(NULL), cursor(NULL),
58          _dispatch_width(dispatch_width),
59          _group(0),
60          force_writemask_all(false),
61          annotation()
62       {
63       }
64 
65       /**
66        * Construct an fs_builder that inserts instructions into \p shader
67        * before instruction \p inst in basic block \p block.  The default
68        * execution controls and debug annotation are initialized from the
69        * instruction passed as argument.
70        */
fs_builder(backend_shader * shader,bblock_t * block,fs_inst * inst)71       fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
72          shader(shader), block(block), cursor(inst),
73          _dispatch_width(inst->exec_size),
74          _group(inst->group),
75          force_writemask_all(inst->force_writemask_all)
76       {
77          annotation.str = inst->annotation;
78          annotation.ir = inst->ir;
79       }
80 
81       /**
82        * Construct an fs_builder that inserts instructions before \p cursor in
83        * basic block \p block, inheriting other code generation parameters
84        * from this.
85        */
86       fs_builder
at(bblock_t * block,exec_node * cursor)87       at(bblock_t *block, exec_node *cursor) const
88       {
89          fs_builder bld = *this;
90          bld.block = block;
91          bld.cursor = cursor;
92          return bld;
93       }
94 
95       /**
96        * Construct an fs_builder appending instructions at the end of the
97        * instruction list of the shader, inheriting other code generation
98        * parameters from this.
99        */
100       fs_builder
at_end()101       at_end() const
102       {
103          return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
104       }
105 
106       /**
107        * Construct a builder specifying the default SIMD width and group of
108        * channel enable signals, inheriting other code generation parameters
109        * from this.
110        *
111        * \p n gives the default SIMD width, \p i gives the slot group used for
112        * predication and control flow masking in multiples of \p n channels.
113        */
114       fs_builder
group(unsigned n,unsigned i)115       group(unsigned n, unsigned i) const
116       {
117          assert(force_writemask_all ||
118                 (n <= dispatch_width() && i < dispatch_width() / n));
119          fs_builder bld = *this;
120          bld._dispatch_width = n;
121          bld._group += i * n;
122          return bld;
123       }
124 
125       /**
126        * Alias for group() with width equal to eight.
127        */
128       fs_builder
half(unsigned i)129       half(unsigned i) const
130       {
131          return group(8, i);
132       }
133 
134       /**
135        * Construct a builder with per-channel control flow execution masking
136        * disabled if \p b is true.  If control flow execution masking is
137        * already disabled this has no effect.
138        */
139       fs_builder
140       exec_all(bool b = true) const
141       {
142          fs_builder bld = *this;
143          if (b)
144             bld.force_writemask_all = true;
145          return bld;
146       }
147 
148       /**
149        * Construct a builder with the given debug annotation info.
150        */
151       fs_builder
152       annotate(const char *str, const void *ir = NULL) const
153       {
154          fs_builder bld = *this;
155          bld.annotation.str = str;
156          bld.annotation.ir = ir;
157          return bld;
158       }
159 
160       /**
161        * Get the SIMD width in use.
162        */
163       unsigned
dispatch_width()164       dispatch_width() const
165       {
166          return _dispatch_width;
167       }
168 
169       /**
170        * Get the channel group in use.
171        */
172       unsigned
group()173       group() const
174       {
175          return _group;
176       }
177 
178       /**
179        * Allocate a virtual register of natural vector size (one for this IR)
180        * and SIMD width.  \p n gives the amount of space to allocate in
181        * dispatch_width units (which is just enough space for one logical
182        * component in this IR).
183        */
184       dst_reg
185       vgrf(enum brw_reg_type type, unsigned n = 1) const
186       {
187          assert(dispatch_width() <= 32);
188 
189          if (n > 0)
190             return dst_reg(VGRF, shader->alloc.allocate(
191                               DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
192                                            REG_SIZE)),
193                            type);
194          else
195             return retype(null_reg_ud(), type);
196       }
197 
198       /**
199        * Create a null register of floating type.
200        */
201       dst_reg
null_reg_f()202       null_reg_f() const
203       {
204          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
205       }
206 
207       dst_reg
null_reg_df()208       null_reg_df() const
209       {
210          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
211       }
212 
213       /**
214        * Create a null register of signed integer type.
215        */
216       dst_reg
null_reg_d()217       null_reg_d() const
218       {
219          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
220       }
221 
222       /**
223        * Create a null register of unsigned integer type.
224        */
225       dst_reg
null_reg_ud()226       null_reg_ud() const
227       {
228          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
229       }
230 
231       /**
232        * Get the mask of SIMD channels enabled by dispatch and not yet
233        * disabled by discard.
234        */
235       src_reg
sample_mask_reg()236       sample_mask_reg() const
237       {
238          assert(shader->stage != MESA_SHADER_FRAGMENT ||
239                 group() + dispatch_width() <= 16);
240          if (shader->stage != MESA_SHADER_FRAGMENT) {
241             return brw_imm_d(0xffffffff);
242          } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
243             return brw_flag_reg(0, 1);
244          } else {
245             return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
246          }
247       }
248 
249       /**
250        * Insert an instruction into the program.
251        */
252       instruction *
emit(const instruction & inst)253       emit(const instruction &inst) const
254       {
255          return emit(new(shader->mem_ctx) instruction(inst));
256       }
257 
258       /**
259        * Create and insert a nullary control instruction into the program.
260        */
261       instruction *
emit(enum opcode opcode)262       emit(enum opcode opcode) const
263       {
264          return emit(instruction(opcode, dispatch_width()));
265       }
266 
267       /**
268        * Create and insert a nullary instruction into the program.
269        */
270       instruction *
emit(enum opcode opcode,const dst_reg & dst)271       emit(enum opcode opcode, const dst_reg &dst) const
272       {
273          return emit(instruction(opcode, dispatch_width(), dst));
274       }
275 
276       /**
277        * Create and insert a unary instruction into the program.
278        */
279       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0)280       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
281       {
282          switch (opcode) {
283          case SHADER_OPCODE_RCP:
284          case SHADER_OPCODE_RSQ:
285          case SHADER_OPCODE_SQRT:
286          case SHADER_OPCODE_EXP2:
287          case SHADER_OPCODE_LOG2:
288          case SHADER_OPCODE_SIN:
289          case SHADER_OPCODE_COS:
290             return emit(instruction(opcode, dispatch_width(), dst,
291                                     fix_math_operand(src0)));
292 
293          default:
294             return emit(instruction(opcode, dispatch_width(), dst, src0));
295          }
296       }
297 
298       /**
299        * Create and insert a binary instruction into the program.
300        */
301       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1)302       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
303            const src_reg &src1) const
304       {
305          switch (opcode) {
306          case SHADER_OPCODE_POW:
307          case SHADER_OPCODE_INT_QUOTIENT:
308          case SHADER_OPCODE_INT_REMAINDER:
309             return emit(instruction(opcode, dispatch_width(), dst,
310                                     fix_math_operand(src0),
311                                     fix_math_operand(src1)));
312 
313          default:
314             return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
315 
316          }
317       }
318 
319       /**
320        * Create and insert a ternary instruction into the program.
321        */
322       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2)323       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
324            const src_reg &src1, const src_reg &src2) const
325       {
326          switch (opcode) {
327          case BRW_OPCODE_BFE:
328          case BRW_OPCODE_BFI2:
329          case BRW_OPCODE_MAD:
330          case BRW_OPCODE_LRP:
331             return emit(instruction(opcode, dispatch_width(), dst,
332                                     fix_3src_operand(src0),
333                                     fix_3src_operand(src1),
334                                     fix_3src_operand(src2)));
335 
336          default:
337             return emit(instruction(opcode, dispatch_width(), dst,
338                                     src0, src1, src2));
339          }
340       }
341 
342       /**
343        * Create and insert an instruction with a variable number of sources
344        * into the program.
345        */
346       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg srcs[],unsigned n)347       emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
348            unsigned n) const
349       {
350          return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
351       }
352 
353       /**
354        * Insert a preallocated instruction into the program.
355        */
356       instruction *
emit(instruction * inst)357       emit(instruction *inst) const
358       {
359          assert(inst->exec_size <= 32);
360          assert(inst->exec_size == dispatch_width() ||
361                 force_writemask_all);
362 
363          inst->group = _group;
364          inst->force_writemask_all = force_writemask_all;
365          inst->annotation = annotation.str;
366          inst->ir = annotation.ir;
367 
368          if (block)
369             static_cast<instruction *>(cursor)->insert_before(block, inst);
370          else
371             cursor->insert_before(inst);
372 
373          return inst;
374       }
375 
376       /**
377        * Select \p src0 if the comparison of both sources with the given
378        * conditional mod evaluates to true, otherwise select \p src1.
379        *
380        * Generally useful to get the minimum or maximum of two values.
381        */
382       instruction *
emit_minmax(const dst_reg & dst,const src_reg & src0,const src_reg & src1,brw_conditional_mod mod)383       emit_minmax(const dst_reg &dst, const src_reg &src0,
384                   const src_reg &src1, brw_conditional_mod mod) const
385       {
386          assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
387 
388          return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
389                                      fix_unsigned_negate(src1)));
390       }
391 
392       /**
393        * Copy any live channel from \p src to the first channel of the result.
394        */
395       src_reg
emit_uniformize(const src_reg & src)396       emit_uniformize(const src_reg &src) const
397       {
398          /* FIXME: We use a vector chan_index and dst to allow constant and
399           * copy propagration to move result all the way into the consuming
400           * instruction (typically a surface index or sampler index for a
401           * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
402           * dispatch. Once we teach const/copy propagation about scalars we
403           * should go back to scalar destinations here.
404           */
405          const fs_builder ubld = exec_all();
406          const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
407          const dst_reg dst = vgrf(src.type);
408 
409          ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
410          ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
411 
412          return src_reg(component(dst, 0));
413       }
414 
      /**
       * Assorted arithmetic ops.
       *
       * Each helper macro below defines one member function named after the
       * opcode that forwards its operands to emit() with BRW_OPCODE_<op>.
       * The macros are only needed for the list of instantiations and are
       * undefined again right after it.
       * @{
       */

/* One-source op: op(dst, src0). */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

/* Two-source op: op(dst, src0, src1). */
#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

/* Two-source op that additionally flags the emitted instruction as
 * writing the accumulator.
 */
#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

/* Three-source op: op(dst, src0, src1, src2). */
#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

/* Keep the helper macros from leaking out of this header. */
#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */
499 
500       /**
501        * CMP: Sets the low bit of the destination channels with the result
502        * of the comparison, while the upper bits are undefined, and updates
503        * the flag register with the packed 16 bits of the result.
504        */
505       instruction *
506       CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
507           brw_conditional_mod condition) const
508       {
509          /* Take the instruction:
510           *
511           * CMP null<d> src0<f> src1<f>
512           *
513           * Original gen4 does type conversion to the destination type
514           * before comparison, producing garbage results for floating
515           * point comparisons.
516           *
517           * The destination type doesn't matter on newer generations,
518           * so we set the type to match src0 so we can compact the
519           * instruction.
520           */
521          return set_condmod(condition,
522                             emit(BRW_OPCODE_CMP, retype(dst, src0.type),
523                                  fix_unsigned_negate(src0),
524                                  fix_unsigned_negate(src1)));
525       }
526 
527       /**
528        * Gen4 predicated IF.
529        */
530       instruction *
IF(brw_predicate predicate)531       IF(brw_predicate predicate) const
532       {
533          return set_predicate(predicate, emit(BRW_OPCODE_IF));
534       }
535 
536       /**
537        * Emit a linear interpolation instruction.
538        */
539       instruction *
LRP(const dst_reg & dst,const src_reg & x,const src_reg & y,const src_reg & a)540       LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
541           const src_reg &a) const
542       {
543          if (shader->devinfo->gen >= 6) {
544             /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
545              * we need to reorder the operands.
546              */
547             return emit(BRW_OPCODE_LRP, dst, a, y, x);
548 
549          } else {
550             /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
551             const dst_reg y_times_a = vgrf(dst.type);
552             const dst_reg one_minus_a = vgrf(dst.type);
553             const dst_reg x_times_one_minus_a = vgrf(dst.type);
554 
555             MUL(y_times_a, y, a);
556             ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
557             MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
558             return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
559          }
560       }
561 
562       /**
563        * Collect a number of registers in a contiguous range of registers.
564        */
565       instruction *
LOAD_PAYLOAD(const dst_reg & dst,const src_reg * src,unsigned sources,unsigned header_size)566       LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
567                    unsigned sources, unsigned header_size) const
568       {
569          instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
570          inst->header_size = header_size;
571          inst->size_written = header_size * REG_SIZE;
572          for (unsigned i = header_size; i < sources; i++) {
573             inst->size_written +=
574                ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
575                      REG_SIZE);
576          }
577 
578          return inst;
579       }
580 
      /**
       * Shader that emitted instructions are inserted into.  Also used to
       * allocate instructions and virtual registers and to query the target
       * device and shader stage.
       */
      backend_shader *shader;
582 
583    private:
584       /**
585        * Workaround for negation of UD registers.  See comment in
586        * fs_generator::generate_code() for more details.
587        */
588       src_reg
fix_unsigned_negate(const src_reg & src)589       fix_unsigned_negate(const src_reg &src) const
590       {
591          if (src.type == BRW_REGISTER_TYPE_UD &&
592              src.negate) {
593             dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
594             MOV(temp, src);
595             return src_reg(temp);
596          } else {
597             return src;
598          }
599       }
600 
601       /**
602        * Workaround for source register modes not supported by the ternary
603        * instruction encoding.
604        */
605       src_reg
fix_3src_operand(const src_reg & src)606       fix_3src_operand(const src_reg &src) const
607       {
608          if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
609             return src;
610          } else {
611             dst_reg expanded = vgrf(src.type);
612             MOV(expanded, src);
613             return expanded;
614          }
615       }
616 
617       /**
618        * Workaround for source register modes not supported by the math
619        * instruction.
620        */
621       src_reg
fix_math_operand(const src_reg & src)622       fix_math_operand(const src_reg &src) const
623       {
624          /* Can't do hstride == 0 args on gen6 math, so expand it out. We
625           * might be able to do better by doing execsize = 1 math and then
626           * expanding that result out, but we would need to be careful with
627           * masking.
628           *
629           * Gen6 hardware ignores source modifiers (negate and abs) on math
630           * instructions, so we also move to a temp to set those up.
631           *
632           * Gen7 relaxes most of the above restrictions, but still can't use IMM
633           * operands to math
634           */
635          if ((shader->devinfo->gen == 6 &&
636               (src.file == IMM || src.file == UNIFORM ||
637                src.abs || src.negate)) ||
638              (shader->devinfo->gen == 7 && src.file == IMM)) {
639             const dst_reg tmp = vgrf(src.type);
640             MOV(tmp, src);
641             return tmp;
642          } else {
643             return src;
644          }
645       }
646 
      /**
       * Basic block instructions are inserted into.  May be NULL, in which
       * case emit() inserts into the plain instruction list instead.
       */
      bblock_t *block;
      /** List node that emitted instructions are inserted right before. */
      exec_node *cursor;

      /** Default execution width given to the instructions created. */
      unsigned _dispatch_width;
      /** Channel group assigned to every emitted instruction. */
      unsigned _group;
      /** Value copied into force_writemask_all of emitted instructions. */
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         /* Copied into the annotation field of emitted instructions. */
         const char *str;
         /* Copied into the ir field of emitted instructions. */
         const void *ir;
      } annotation;
659    };
660 }
661 
662 #endif
663