1 /* -*- c++ -*- */
2 /*
3  * Copyright © 2010-2015 Intel Corporation
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 #ifndef BRW_VEC4_BUILDER_H
26 #define BRW_VEC4_BUILDER_H
27 
28 #include "brw_ir_vec4.h"
29 #include "brw_ir_allocator.h"
30 
31 namespace brw {
32    /**
33     * Toolbox to assemble a VEC4 IR program out of individual instructions.
34     *
35     * This object is meant to have an interface consistent with
36     * brw::fs_builder.  They cannot be fully interchangeable because
37     * brw::fs_builder generates scalar code while brw::vec4_builder generates
38     * vector code.
39     */
40    class vec4_builder {
41    public:
42       /** Type used in this IR to represent a source of an instruction. */
43       typedef brw::src_reg src_reg;
44 
45       /** Type used in this IR to represent the destination of an instruction. */
46       typedef brw::dst_reg dst_reg;
47 
48       /** Type used in this IR to represent an instruction. */
49       typedef vec4_instruction instruction;
50 
51       /**
52        * Construct a vec4_builder that inserts instructions into \p shader.
53        */
54       vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) :
shader(shader)55          shader(shader), block(NULL), cursor(NULL),
56          _dispatch_width(dispatch_width), _group(0),
57          force_writemask_all(false),
58          annotation()
59       {
60       }
61 
62       /**
63        * Construct a vec4_builder that inserts instructions into \p shader
64        * before instruction \p inst in basic block \p block.  The default
65        * execution controls and debug annotation are initialized from the
66        * instruction passed as argument.
67        */
vec4_builder(backend_shader * shader,bblock_t * block,instruction * inst)68       vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
69          shader(shader), block(block), cursor(inst),
70          _dispatch_width(inst->exec_size), _group(inst->group),
71          force_writemask_all(inst->force_writemask_all)
72       {
73          annotation.str = inst->annotation;
74          annotation.ir = inst->ir;
75       }
76 
77       /**
78        * Construct a vec4_builder that inserts instructions before \p cursor
79        * in basic block \p block, inheriting other code generation parameters
80        * from this.
81        */
82       vec4_builder
at(bblock_t * block,exec_node * cursor)83       at(bblock_t *block, exec_node *cursor) const
84       {
85          vec4_builder bld = *this;
86          bld.block = block;
87          bld.cursor = cursor;
88          return bld;
89       }
90 
91       /**
92        * Construct a vec4_builder appending instructions at the end of the
93        * instruction list of the shader, inheriting other code generation
94        * parameters from this.
95        */
96       vec4_builder
at_end()97       at_end() const
98       {
99          return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
100       }
101 
102       /**
103        * Construct a builder specifying the default SIMD width and group of
104        * channel enable signals, inheriting other code generation parameters
105        * from this.
106        *
107        * \p n gives the default SIMD width, \p i gives the slot group used for
108        * predication and control flow masking in multiples of \p n channels.
109        */
110       vec4_builder
group(unsigned n,unsigned i)111       group(unsigned n, unsigned i) const
112       {
113          assert(force_writemask_all ||
114                 (n <= dispatch_width() && i < dispatch_width() / n));
115          vec4_builder bld = *this;
116          bld._dispatch_width = n;
117          bld._group += i * n;
118          return bld;
119       }
120 
121       /**
122        * Construct a builder with per-channel control flow execution masking
123        * disabled if \p b is true.  If control flow execution masking is
124        * already disabled this has no effect.
125        */
126       vec4_builder
127       exec_all(bool b = true) const
128       {
129          vec4_builder bld = *this;
130          if (b)
131             bld.force_writemask_all = true;
132          return bld;
133       }
134 
135       /**
136        * Construct a builder with the given debug annotation info.
137        */
138       vec4_builder
139       annotate(const char *str, const void *ir = NULL) const
140       {
141          vec4_builder bld = *this;
142          bld.annotation.str = str;
143          bld.annotation.ir = ir;
144          return bld;
145       }
146 
147       /**
148        * Get the SIMD width in use.
149        */
150       unsigned
dispatch_width()151       dispatch_width() const
152       {
153          return _dispatch_width;
154       }
155 
156       /**
157        * Get the channel group in use.
158        */
159       unsigned
group()160       group() const
161       {
162          return _group;
163       }
164 
165       /**
166        * Allocate a virtual register of natural vector size (four for this IR)
167        * and SIMD width.  \p n gives the amount of space to allocate in
168        * dispatch_width units (which is just enough space for four logical
169        * components in this IR).
170        */
171       dst_reg
172       vgrf(enum brw_reg_type type, unsigned n = 1) const
173       {
174          assert(dispatch_width() <= 32);
175 
176          if (n > 0)
177             return retype(dst_reg(VGRF, shader->alloc.allocate(
178                                      n * DIV_ROUND_UP(type_sz(type), 4))),
179                            type);
180          else
181             return retype(null_reg_ud(), type);
182       }
183 
184       /**
185        * Create a null register of floating type.
186        */
187       dst_reg
null_reg_f()188       null_reg_f() const
189       {
190          return dst_reg(retype(brw_null_vec(dispatch_width()),
191                                BRW_REGISTER_TYPE_F));
192       }
193 
194       /**
195        * Create a null register of signed integer type.
196        */
197       dst_reg
null_reg_d()198       null_reg_d() const
199       {
200          return dst_reg(retype(brw_null_vec(dispatch_width()),
201                                BRW_REGISTER_TYPE_D));
202       }
203 
204       /**
205        * Create a null register of unsigned integer type.
206        */
207       dst_reg
null_reg_ud()208       null_reg_ud() const
209       {
210          return dst_reg(retype(brw_null_vec(dispatch_width()),
211                                BRW_REGISTER_TYPE_UD));
212       }
213 
214       /**
215        * Insert an instruction into the program.
216        */
217       instruction *
emit(const instruction & inst)218       emit(const instruction &inst) const
219       {
220          return emit(new(shader->mem_ctx) instruction(inst));
221       }
222 
223       /**
224        * Create and insert a nullary control instruction into the program.
225        */
226       instruction *
emit(enum opcode opcode)227       emit(enum opcode opcode) const
228       {
229          return emit(instruction(opcode));
230       }
231 
232       /**
233        * Create and insert a nullary instruction into the program.
234        */
235       instruction *
emit(enum opcode opcode,const dst_reg & dst)236       emit(enum opcode opcode, const dst_reg &dst) const
237       {
238          return emit(instruction(opcode, dst));
239       }
240 
241       /**
242        * Create and insert a unary instruction into the program.
243        */
244       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0)245       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
246       {
247          switch (opcode) {
248          case SHADER_OPCODE_RCP:
249          case SHADER_OPCODE_RSQ:
250          case SHADER_OPCODE_SQRT:
251          case SHADER_OPCODE_EXP2:
252          case SHADER_OPCODE_LOG2:
253          case SHADER_OPCODE_SIN:
254          case SHADER_OPCODE_COS:
255             return fix_math_instruction(
256                emit(instruction(opcode, dst,
257                                 fix_math_operand(src0))));
258 
259          default:
260             return emit(instruction(opcode, dst, src0));
261          }
262       }
263 
264       /**
265        * Create and insert a binary instruction into the program.
266        */
267       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1)268       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
269            const src_reg &src1) const
270       {
271          switch (opcode) {
272          case SHADER_OPCODE_POW:
273          case SHADER_OPCODE_INT_QUOTIENT:
274          case SHADER_OPCODE_INT_REMAINDER:
275             return fix_math_instruction(
276                emit(instruction(opcode, dst,
277                                 fix_math_operand(src0),
278                                 fix_math_operand(src1))));
279 
280          default:
281             return emit(instruction(opcode, dst, src0, src1));
282          }
283       }
284 
285       /**
286        * Create and insert a ternary instruction into the program.
287        */
288       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2)289       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
290            const src_reg &src1, const src_reg &src2) const
291       {
292          switch (opcode) {
293          case BRW_OPCODE_BFE:
294          case BRW_OPCODE_BFI2:
295          case BRW_OPCODE_MAD:
296          case BRW_OPCODE_LRP:
297             return emit(instruction(opcode, dst,
298                                     fix_3src_operand(src0),
299                                     fix_3src_operand(src1),
300                                     fix_3src_operand(src2)));
301 
302          default:
303             return emit(instruction(opcode, dst, src0, src1, src2));
304          }
305       }
306 
307       /**
308        * Insert a preallocated instruction into the program.
309        */
310       instruction *
emit(instruction * inst)311       emit(instruction *inst) const
312       {
313          inst->exec_size = dispatch_width();
314          inst->group = group();
315          inst->force_writemask_all = force_writemask_all;
316          inst->size_written = inst->exec_size * type_sz(inst->dst.type);
317          inst->annotation = annotation.str;
318          inst->ir = annotation.ir;
319 
320          if (block)
321             static_cast<instruction *>(cursor)->insert_before(block, inst);
322          else
323             cursor->insert_before(inst);
324 
325          return inst;
326       }
327 
328       /**
329        * Select \p src0 if the comparison of both sources with the given
330        * conditional mod evaluates to true, otherwise select \p src1.
331        *
332        * Generally useful to get the minimum or maximum of two values.
333        */
334       instruction *
emit_minmax(const dst_reg & dst,const src_reg & src0,const src_reg & src1,brw_conditional_mod mod)335       emit_minmax(const dst_reg &dst, const src_reg &src0,
336                   const src_reg &src1, brw_conditional_mod mod) const
337       {
338          assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
339 
340          return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
341                                      fix_unsigned_negate(src1)));
342       }
343 
344       /**
345        * Copy any live channel from \p src to the first channel of the result.
346        */
347       src_reg
emit_uniformize(const src_reg & src)348       emit_uniformize(const src_reg &src) const
349       {
350          const vec4_builder ubld = exec_all();
351          const dst_reg chan_index =
352             writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
353          const dst_reg dst = vgrf(src.type);
354 
355          ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
356          ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));
357 
358          return src_reg(dst);
359       }
360 
361       /**
362        * Assorted arithmetic ops.
363        * @{
364        */
365 #define ALU1(op)                                        \
366       instruction *                                     \
367       op(const dst_reg &dst, const src_reg &src0) const \
368       {                                                 \
369          return emit(BRW_OPCODE_##op, dst, src0);       \
370       }
371 
372 #define ALU2(op)                                                        \
373       instruction *                                                     \
374       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
375       {                                                                 \
376          return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
377       }
378 
379 #define ALU2_ACC(op)                                                    \
380       instruction *                                                     \
381       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
382       {                                                                 \
383          instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
384          inst->writes_accumulator = true;                               \
385          return inst;                                                   \
386       }
387 
388 #define ALU3(op)                                                        \
389       instruction *                                                     \
390       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
391          const src_reg &src2) const                                     \
392       {                                                                 \
393          return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
394       }
395 
396       ALU2(ADD)
ALU2_ACC(ADDC)397       ALU2_ACC(ADDC)
398       ALU2(AND)
399       ALU2(ASR)
400       ALU2(AVG)
401       ALU3(BFE)
402       ALU2(BFI1)
403       ALU3(BFI2)
404       ALU1(BFREV)
405       ALU1(CBIT)
406       ALU2(CMPN)
407       ALU3(CSEL)
408       ALU1(DIM)
409       ALU2(DP2)
410       ALU2(DP3)
411       ALU2(DP4)
412       ALU2(DPH)
413       ALU1(F16TO32)
414       ALU1(F32TO16)
415       ALU1(FBH)
416       ALU1(FBL)
417       ALU1(FRC)
418       ALU2(LINE)
419       ALU1(LZD)
420       ALU2(MAC)
421       ALU2_ACC(MACH)
422       ALU3(MAD)
423       ALU1(MOV)
424       ALU2(MUL)
425       ALU1(NOT)
426       ALU2(OR)
427       ALU2(PLN)
428       ALU1(RNDD)
429       ALU1(RNDE)
430       ALU1(RNDU)
431       ALU1(RNDZ)
432       ALU2(SAD2)
433       ALU2_ACC(SADA2)
434       ALU2(SEL)
435       ALU2(SHL)
436       ALU2(SHR)
437       ALU2_ACC(SUBB)
438       ALU2(XOR)
439 
440 #undef ALU3
441 #undef ALU2_ACC
442 #undef ALU2
443 #undef ALU1
444       /** @} */
445 
446       /**
447        * CMP: Sets the low bit of the destination channels with the result
448        * of the comparison, while the upper bits are undefined, and updates
449        * the flag register with the packed 16 bits of the result.
450        */
451       instruction *
452       CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
453           brw_conditional_mod condition) const
454       {
455          /* Take the instruction:
456           *
457           * CMP null<d> src0<f> src1<f>
458           *
459           * Original gen4 does type conversion to the destination type
460           * before comparison, producing garbage results for floating
461           * point comparisons.
462           *
463           * The destination type doesn't matter on newer generations,
464           * so we set the type to match src0 so we can compact the
465           * instruction.
466           */
467          return set_condmod(condition,
468                             emit(BRW_OPCODE_CMP, retype(dst, src0.type),
469                                  fix_unsigned_negate(src0),
470                                  fix_unsigned_negate(src1)));
471       }
472 
473       /**
474        * Gen4 predicated IF.
475        */
476       instruction *
IF(brw_predicate predicate)477       IF(brw_predicate predicate) const
478       {
479          return set_predicate(predicate, emit(BRW_OPCODE_IF));
480       }
481 
482       /**
483        * Gen6 IF with embedded comparison.
484        */
485       instruction *
IF(const src_reg & src0,const src_reg & src1,brw_conditional_mod condition)486       IF(const src_reg &src0, const src_reg &src1,
487          brw_conditional_mod condition) const
488       {
489          assert(shader->devinfo->gen == 6);
490          return set_condmod(condition,
491                             emit(BRW_OPCODE_IF,
492                                  null_reg_d(),
493                                  fix_unsigned_negate(src0),
494                                  fix_unsigned_negate(src1)));
495       }
496 
497       /**
498        * Emit a linear interpolation instruction.
499        */
500       instruction *
LRP(const dst_reg & dst,const src_reg & x,const src_reg & y,const src_reg & a)501       LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
502           const src_reg &a) const
503       {
504          if (shader->devinfo->gen >= 6) {
505             /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
506              * we need to reorder the operands.
507              */
508             return emit(BRW_OPCODE_LRP, dst, a, y, x);
509 
510          } else {
511             /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
512             const dst_reg y_times_a = vgrf(dst.type);
513             const dst_reg one_minus_a = vgrf(dst.type);
514             const dst_reg x_times_one_minus_a = vgrf(dst.type);
515 
516             MUL(y_times_a, y, a);
517             ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
518             MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
519             return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
520          }
521       }
522 
523       backend_shader *shader;
524 
525    protected:
526       /**
527        * Workaround for negation of UD registers.  See comment in
528        * fs_generator::generate_code() for the details.
529        */
530       src_reg
fix_unsigned_negate(const src_reg & src)531       fix_unsigned_negate(const src_reg &src) const
532       {
533          if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
534             dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
535             MOV(temp, src);
536             return src_reg(temp);
537          } else {
538             return src;
539          }
540       }
541 
542       /**
543        * Workaround for register access modes not supported by the ternary
544        * instruction encoding.
545        */
546       src_reg
fix_3src_operand(const src_reg & src)547       fix_3src_operand(const src_reg &src) const
548       {
549          /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
550           * able to use vertical stride of zero to replicate the vec4 uniform, like
551           *
552           *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
553           *
554           * But you can't, since vertical stride is always four in three-source
555           * instructions. Instead, insert a MOV instruction to do the replication so
556           * that the three-source instruction can consume it.
557           */
558 
559          /* The MOV is only needed if the source is a uniform or immediate. */
560          if (src.file != UNIFORM && src.file != IMM)
561             return src;
562 
563          if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
564             return src;
565 
566          const dst_reg expanded = vgrf(src.type);
567          emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
568          return src_reg(expanded);
569       }
570 
571       /**
572        * Workaround for register access modes not supported by the math
573        * instruction.
574        */
575       src_reg
fix_math_operand(const src_reg & src)576       fix_math_operand(const src_reg &src) const
577       {
578          /* The gen6 math instruction ignores the source modifiers --
579           * swizzle, abs, negate, and at least some parts of the register
580           * region description.
581           *
582           * Rather than trying to enumerate all these cases, *always* expand the
583           * operand to a temp GRF for gen6.
584           *
585           * For gen7, keep the operand as-is, except if immediate, which gen7 still
586           * can't use.
587           */
588          if (shader->devinfo->gen == 6 ||
589              (shader->devinfo->gen == 7 && src.file == IMM)) {
590             const dst_reg tmp = vgrf(src.type);
591             MOV(tmp, src);
592             return src_reg(tmp);
593          } else {
594             return src;
595          }
596       }
597 
598       /**
599        * Workaround other weirdness of the math instruction.
600        */
601       instruction *
fix_math_instruction(instruction * inst)602       fix_math_instruction(instruction *inst) const
603       {
604          if (shader->devinfo->gen == 6 &&
605              inst->dst.writemask != WRITEMASK_XYZW) {
606             const dst_reg tmp = vgrf(inst->dst.type);
607             MOV(inst->dst, src_reg(tmp));
608             inst->dst = tmp;
609 
610          } else if (shader->devinfo->gen < 6) {
611             const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
612             inst->base_mrf = 1;
613             inst->mlen = sources;
614          }
615 
616          return inst;
617       }
618 
619       bblock_t *block;
620       exec_node *cursor;
621 
622       unsigned _dispatch_width;
623       unsigned _group;
624       bool force_writemask_all;
625 
626       /** Debug annotation info. */
627       struct {
628          const char *str;
629          const void *ir;
630       } annotation;
631    };
632 }
633 
634 #endif
635