1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file brw_fs_generator.cpp
25  *
26  * This file supports generating code from the FS LIR to the actual
27  * native instructions.
28  */
29 
30 #include "brw_eu.h"
31 #include "brw_fs.h"
32 #include "brw_cfg.h"
33 
34 static enum brw_reg_file
brw_file_from_reg(fs_reg * reg)35 brw_file_from_reg(fs_reg *reg)
36 {
37    switch (reg->file) {
38    case ARF:
39       return BRW_ARCHITECTURE_REGISTER_FILE;
40    case FIXED_GRF:
41    case VGRF:
42       return BRW_GENERAL_REGISTER_FILE;
43    case MRF:
44       return BRW_MESSAGE_REGISTER_FILE;
45    case IMM:
46       return BRW_IMMEDIATE_VALUE;
47    case BAD_FILE:
48    case ATTR:
49    case UNIFORM:
50       unreachable("not reached");
51    }
52    return BRW_ARCHITECTURE_REGISTER_FILE;
53 }
54 
55 static struct brw_reg
brw_reg_from_fs_reg(const struct gen_device_info * devinfo,fs_inst * inst,fs_reg * reg,bool compressed)56 brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst,
57                     fs_reg *reg, bool compressed)
58 {
59    struct brw_reg brw_reg;
60 
61    switch (reg->file) {
62    case MRF:
63       assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
64       /* Fallthrough */
65    case VGRF:
66       if (reg->stride == 0) {
67          brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
68       } else {
69          /* From the Haswell PRM:
70           *
71           *  "VertStride must be used to cross GRF register boundaries. This
72           *   rule implies that elements within a 'Width' cannot cross GRF
73           *   boundaries."
74           *
75           * The maximum width value that could satisfy this restriction is:
76           */
77          const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
78 
79          /* Because the hardware can only split source regions at a whole
80           * multiple of width during decompression (i.e. vertically), clamp
81           * the value obtained above to the physical execution size of a
82           * single decompressed chunk of the instruction:
83           */
84          const unsigned phys_width = compressed ? inst->exec_size / 2 :
85                                      inst->exec_size;
86 
87          /* XXX - The equation above is strictly speaking not correct on
88           *       hardware that supports unbalanced GRF writes -- On Gen9+
89           *       each decompressed chunk of the instruction may have a
90           *       different execution size when the number of components
91           *       written to each destination GRF is not the same.
92           */
93          const unsigned width = MIN2(reg_width, phys_width);
94          brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
95          brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
96 
97          if (devinfo->gen == 7 && !devinfo->is_haswell) {
98             /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13):
99              *  "Each DF (Double Float) operand uses an element size of 4 rather
100              *   than 8 and all regioning parameters are twice what the values
101              *   would be based on the true element size: ExecSize, Width,
102              *   HorzStride, and VertStride. Each DF operand uses a pair of
103              *   channels and all masking and swizzing should be adjusted
104              *   appropriately."
105              *
106              * From the IvyBridge PRM (Special Requirements for Handling Double
107              * Precision Data Types, page 71):
108              *  "In Align1 mode, all regioning parameters like stride, execution
109              *   size, and width must use the syntax of a pair of packed
110              *   floats. The offsets for these data types must be 64-bit
111              *   aligned. The execution size and regioning parameters are in terms
112              *   of floats."
113              *
114              * Summarized: when handling DF-typed arguments, ExecSize,
115              * VertStride, and Width must be doubled.
116              *
117              * It applies to BayTrail too.
118              */
119             if (type_sz(reg->type) == 8) {
120                brw_reg.width++;
121                if (brw_reg.vstride > 0)
122                   brw_reg.vstride++;
123                assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1);
124             }
125 
126             /* When converting from DF->F, we set the destination stride to 2
127              * because each d2f conversion implicitly writes 2 floats, being
128              * the first one the converted value. IVB/BYT actually writes two
129              * F components per SIMD channel, and every other component is
130              * filled with garbage.
131              */
132             if (reg == &inst->dst && get_exec_type_size(inst) == 8 &&
133                 type_sz(inst->dst.type) < 8) {
134                assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1);
135                brw_reg.hstride--;
136             }
137          }
138       }
139 
140       brw_reg = retype(brw_reg, reg->type);
141       brw_reg = byte_offset(brw_reg, reg->offset);
142       brw_reg.abs = reg->abs;
143       brw_reg.negate = reg->negate;
144       break;
145    case ARF:
146    case FIXED_GRF:
147    case IMM:
148       assert(reg->offset == 0);
149       brw_reg = reg->as_brw_reg();
150       break;
151    case BAD_FILE:
152       /* Probably unused. */
153       brw_reg = brw_null_reg();
154       break;
155    case ATTR:
156    case UNIFORM:
157       unreachable("not reached");
158    }
159 
160    /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0>
161     * region, but on IVB and BYT DF regions must be programmed in terms of
162     * floats. A <0,2,1> region accomplishes this.
163     */
164    if (devinfo->gen == 7 && !devinfo->is_haswell &&
165        type_sz(reg->type) == 8 &&
166        brw_reg.vstride == BRW_VERTICAL_STRIDE_0 &&
167        brw_reg.width == BRW_WIDTH_1 &&
168        brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) {
169       brw_reg.width = BRW_WIDTH_2;
170       brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1;
171    }
172 
173    return brw_reg;
174 }
175 
fs_generator(const struct brw_compiler * compiler,void * log_data,void * mem_ctx,const void * key,struct brw_stage_prog_data * prog_data,unsigned promoted_constants,bool runtime_check_aads_emit,gl_shader_stage stage)176 fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
177                            void *mem_ctx,
178                            const void *key,
179                            struct brw_stage_prog_data *prog_data,
180                            unsigned promoted_constants,
181                            bool runtime_check_aads_emit,
182                            gl_shader_stage stage)
183 
184    : compiler(compiler), log_data(log_data),
185      devinfo(compiler->devinfo), key(key),
186      prog_data(prog_data),
187      promoted_constants(promoted_constants),
188      runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
189      stage(stage), mem_ctx(mem_ctx)
190 {
191    p = rzalloc(mem_ctx, struct brw_codegen);
192    brw_init_codegen(devinfo, p, mem_ctx);
193 
194    /* In the FS code generator, we are very careful to ensure that we always
195     * set the right execution size so we don't need the EU code to "help" us
196     * by trying to infer it.  Sometimes, it infers the wrong thing.
197     */
198    p->automatic_exec_sizes = false;
199 }
200 
~fs_generator()201 fs_generator::~fs_generator()
202 {
203 }
204 
205 class ip_record : public exec_node {
206 public:
207    DECLARE_RALLOC_CXX_OPERATORS(ip_record)
208 
ip_record(int ip)209    ip_record(int ip)
210    {
211       this->ip = ip;
212    }
213 
214    int ip;
215 };
216 
217 bool
patch_discard_jumps_to_fb_writes()218 fs_generator::patch_discard_jumps_to_fb_writes()
219 {
220    if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
221       return false;
222 
223    int scale = brw_jump_scale(p->devinfo);
224 
225    /* There is a somewhat strange undocumented requirement of using
226     * HALT, according to the simulator.  If some channel has HALTed to
227     * a particular UIP, then by the end of the program, every channel
228     * must have HALTed to that UIP.  Furthermore, the tracking is a
229     * stack, so you can't do the final halt of a UIP after starting
230     * halting to a new UIP.
231     *
232     * Symptoms of not emitting this instruction on actual hardware
233     * included GPU hangs and sparkly rendering on the piglit discard
234     * tests.
235     */
236    brw_inst *last_halt = gen6_HALT(p);
237    brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
238    brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
239 
240    int ip = p->nr_insn;
241 
242    foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
243       brw_inst *patch = &p->store[patch_ip->ip];
244 
245       assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
246       /* HALT takes a half-instruction distance from the pre-incremented IP. */
247       brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
248    }
249 
250    this->discard_halt_patches.make_empty();
251    return true;
252 }
253 
254 void
fire_fb_write(fs_inst * inst,struct brw_reg payload,struct brw_reg implied_header,GLuint nr)255 fs_generator::fire_fb_write(fs_inst *inst,
256                             struct brw_reg payload,
257                             struct brw_reg implied_header,
258                             GLuint nr)
259 {
260    uint32_t msg_control;
261 
262    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
263 
264    if (devinfo->gen < 6) {
265       brw_push_insn_state(p);
266       brw_set_default_exec_size(p, BRW_EXECUTE_8);
267       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
268       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
269       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
270       brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
271       brw_pop_insn_state(p);
272    }
273 
274    if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
275       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
276    else if (prog_data->dual_src_blend) {
277       if (!inst->group)
278          msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
279       else
280          msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
281    } else if (inst->exec_size == 16)
282       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
283    else
284       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
285 
286    /* We assume render targets start at 0, because headerless FB write
287     * messages set "Render Target Index" to 0.  Using a different binding
288     * table index would make it impossible to use headerless messages.
289     */
290    const uint32_t surf_index = inst->target;
291 
292    bool last_render_target = inst->eot ||
293                              (prog_data->dual_src_blend && dispatch_width == 16);
294 
295 
296    brw_fb_WRITE(p,
297                 payload,
298                 implied_header,
299                 msg_control,
300                 surf_index,
301                 nr,
302                 0,
303                 inst->eot,
304                 last_render_target,
305                 inst->header_size != 0);
306 
307    brw_mark_surface_used(&prog_data->base, surf_index);
308 }
309 
310 void
generate_fb_write(fs_inst * inst,struct brw_reg payload)311 fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
312 {
313    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
314    const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
315    struct brw_reg implied_header;
316 
317    if (devinfo->gen < 8 && !devinfo->is_haswell) {
318       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
319    }
320 
321    if (inst->base_mrf >= 0)
322       payload = brw_message_reg(inst->base_mrf);
323 
324    /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
325     * move, here's g1.
326     */
327    if (inst->header_size != 0) {
328       brw_push_insn_state(p);
329       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
330       brw_set_default_exec_size(p, BRW_EXECUTE_1);
331       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
332       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
333       brw_set_default_flag_reg(p, 0, 0);
334 
335       /* On HSW, the GPU will use the predicate on SENDC, unless the header is
336        * present.
337        */
338       if (prog_data->uses_kill) {
339          struct brw_reg pixel_mask;
340 
341          if (devinfo->gen >= 6)
342             pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
343          else
344             pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
345 
346          brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
347       }
348 
349       if (devinfo->gen >= 6) {
350          brw_push_insn_state(p);
351          brw_set_default_exec_size(p, BRW_EXECUTE_16);
352 	 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
353 	 brw_MOV(p,
354 		 retype(payload, BRW_REGISTER_TYPE_UD),
355 		 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
356          brw_pop_insn_state(p);
357 
358          if (inst->target > 0 && key->replicate_alpha) {
359             /* Set "Source0 Alpha Present to RenderTarget" bit in message
360              * header.
361              */
362             brw_OR(p,
363 		   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
364 		   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
365 		   brw_imm_ud(0x1 << 11));
366          }
367 
368 	 if (inst->target > 0) {
369 	    /* Set the render target index for choosing BLEND_STATE. */
370 	    brw_MOV(p, retype(vec1(suboffset(payload, 2)),
371                               BRW_REGISTER_TYPE_UD),
372 		    brw_imm_ud(inst->target));
373 	 }
374 
375          /* Set computes stencil to render target */
376          if (prog_data->computed_stencil) {
377             brw_OR(p,
378                    vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
379                    vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
380                    brw_imm_ud(0x1 << 14));
381          }
382 
383 	 implied_header = brw_null_reg();
384       } else {
385 	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
386       }
387 
388       brw_pop_insn_state(p);
389    } else {
390       implied_header = brw_null_reg();
391    }
392 
393    if (!runtime_check_aads_emit) {
394       fire_fb_write(inst, payload, implied_header, inst->mlen);
395    } else {
396       /* This can only happen in gen < 6 */
397       assert(devinfo->gen < 6);
398 
399       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
400 
401       /* Check runtime bit to detect if we have to send AA data or not */
402       brw_push_insn_state(p);
403       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
404       brw_set_default_exec_size(p, BRW_EXECUTE_1);
405       brw_AND(p,
406               v1_null_ud,
407               retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
408               brw_imm_ud(1<<26));
409       brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
410 
411       int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
412       brw_pop_insn_state(p);
413       {
414          /* Don't send AA data */
415          fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
416       }
417       brw_land_fwd_jump(p, jmp);
418       fire_fb_write(inst, payload, implied_header, inst->mlen);
419    }
420 }
421 
422 void
generate_fb_read(fs_inst * inst,struct brw_reg dst,struct brw_reg payload)423 fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
424                                struct brw_reg payload)
425 {
426    assert(inst->size_written % REG_SIZE == 0);
427    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
428    /* We assume that render targets start at binding table index 0. */
429    const unsigned surf_index = inst->target;
430 
431    gen9_fb_READ(p, dst, payload, surf_index,
432                 inst->header_size, inst->size_written / REG_SIZE,
433                 prog_data->persample_dispatch);
434 
435    brw_mark_surface_used(&prog_data->base, surf_index);
436 }
437 
438 void
generate_mov_indirect(fs_inst * inst,struct brw_reg dst,struct brw_reg reg,struct brw_reg indirect_byte_offset)439 fs_generator::generate_mov_indirect(fs_inst *inst,
440                                     struct brw_reg dst,
441                                     struct brw_reg reg,
442                                     struct brw_reg indirect_byte_offset)
443 {
444    assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
445    assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
446    assert(!reg.abs && !reg.negate);
447    assert(reg.type == dst.type);
448 
449    unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
450 
451    if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
452       imm_byte_offset += indirect_byte_offset.ud;
453 
454       reg.nr = imm_byte_offset / REG_SIZE;
455       reg.subnr = imm_byte_offset % REG_SIZE;
456       brw_MOV(p, dst, reg);
457    } else {
458       /* Prior to Broadwell, there are only 8 address registers. */
459       assert(inst->exec_size <= 8 || devinfo->gen >= 8);
460 
461       /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
462       struct brw_reg addr = vec8(brw_address_reg(0));
463 
464       /* The destination stride of an instruction (in bytes) must be greater
465        * than or equal to the size of the rest of the instruction.  Since the
466        * address register is of type UW, we can't use a D-type instruction.
467        * In order to get around this, re retype to UW and use a stride.
468        */
469       indirect_byte_offset =
470          retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
471 
472       /* There are a number of reasons why we don't use the base offset here.
473        * One reason is that the field is only 9 bits which means we can only
474        * use it to access the first 16 GRFs.  Also, from the Haswell PRM
475        * section "Register Region Restrictions":
476        *
477        *    "The lower bits of the AddressImmediate must not overflow to
478        *    change the register address.  The lower 5 bits of Address
479        *    Immediate when added to lower 5 bits of address register gives
480        *    the sub-register offset. The upper bits of Address Immediate
481        *    when added to upper bits of address register gives the register
482        *    address. Any overflow from sub-register offset is dropped."
483        *
484        * Since the indirect may cause us to cross a register boundary, this
485        * makes the base offset almost useless.  We could try and do something
486        * clever where we use a actual base offset if base_offset % 32 == 0 but
487        * that would mean we were generating different code depending on the
488        * base offset.  Instead, for the sake of consistency, we'll just do the
489        * add ourselves.  This restriction is only listed in the Haswell PRM
490        * but empirical testing indicates that it applies on all older
491        * generations and is lifted on Broadwell.
492        *
493        * In the end, while base_offset is nice to look at in the generated
494        * code, using it saves us 0 instructions and would require quite a bit
495        * of case-by-case work.  It's just not worth it.
496        */
497       brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
498 
499       if (type_sz(reg.type) > 4 &&
500           ((devinfo->gen == 7 && !devinfo->is_haswell) ||
501            devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
502          /* IVB has an issue (which we found empirically) where it reads two
503           * address register components per channel for indirectly addressed
504           * 64-bit sources.
505           *
506           * From the Cherryview PRM Vol 7. "Register Region Restrictions":
507           *
508           *    "When source or destination datatype is 64b or operation is
509           *    integer DWord multiply, indirect addressing must not be used."
510           *
511           * To work around both of these, we do two integer MOVs insead of one
512           * 64-bit MOV.  Because no double value should ever cross a register
513           * boundary, it's safe to use the immediate offset in the indirect
514           * here to handle adding 4 bytes to the offset and avoid the extra
515           * ADD to the register file.
516           */
517          brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
518                     retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
519          brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
520                     retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
521       } else {
522          struct brw_reg ind_src = brw_VxH_indirect(0, 0);
523 
524          brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));
525 
526          if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
527              !inst->get_next()->is_tail_sentinel() &&
528              ((fs_inst *)inst->get_next())->mlen > 0) {
529             /* From the Sandybridge PRM:
530              *
531              *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
532              *    instruction that “indexed/indirect” source AND is followed
533              *    by a send, the instruction requires a “Switch”. This is to
534              *    avoid race condition where send may dispatch before MRF is
535              *    updated."
536              */
537             brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
538          }
539       }
540    }
541 }
542 
543 void
generate_urb_read(fs_inst * inst,struct brw_reg dst,struct brw_reg header)544 fs_generator::generate_urb_read(fs_inst *inst,
545                                 struct brw_reg dst,
546                                 struct brw_reg header)
547 {
548    assert(inst->size_written % REG_SIZE == 0);
549    assert(header.file == BRW_GENERAL_REGISTER_FILE);
550    assert(header.type == BRW_REGISTER_TYPE_UD);
551 
552    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
553    brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
554    brw_set_src0(p, send, header);
555    brw_set_src1(p, send, brw_imm_ud(0u));
556 
557    brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
558    brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);
559 
560    if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
561       brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);
562 
563    brw_inst_set_mlen(p->devinfo, send, inst->mlen);
564    brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE);
565    brw_inst_set_header_present(p->devinfo, send, true);
566    brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
567 }
568 
569 void
generate_urb_write(fs_inst * inst,struct brw_reg payload)570 fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
571 {
572    brw_inst *insn;
573 
574     /* WaClearTDRRegBeforeEOTForNonPS.
575      *
576      *   WA: Clear tdr register before send EOT in all non-PS shader kernels
577      *
578      *   mov(8) tdr0:ud 0x0:ud {NoMask}"
579      */
580    if (inst->eot && p->devinfo->gen == 10) {
581       brw_push_insn_state(p);
582       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
583       brw_MOV(p, brw_tdr_reg(), brw_imm_uw(0));
584       brw_pop_insn_state(p);
585    }
586 
587    insn = brw_next_insn(p, BRW_OPCODE_SEND);
588 
589    brw_set_dest(p, insn, brw_null_reg());
590    brw_set_src0(p, insn, payload);
591    brw_set_src1(p, insn, brw_imm_d(0));
592 
593    brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
594    brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);
595 
596    if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
597        inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
598       brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);
599 
600    if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
601        inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
602       brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);
603 
604    brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
605    brw_inst_set_rlen(p->devinfo, insn, 0);
606    brw_inst_set_eot(p->devinfo, insn, inst->eot);
607    brw_inst_set_header_present(p->devinfo, insn, true);
608    brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
609 }
610 
611 void
generate_cs_terminate(fs_inst * inst,struct brw_reg payload)612 fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
613 {
614    struct brw_inst *insn;
615 
616    insn = brw_next_insn(p, BRW_OPCODE_SEND);
617 
618    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
619    brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
620    brw_set_src1(p, insn, brw_imm_d(0));
621 
622    /* Terminate a compute shader by sending a message to the thread spawner.
623     */
624    brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
625    brw_inst_set_mlen(devinfo, insn, 1);
626    brw_inst_set_rlen(devinfo, insn, 0);
627    brw_inst_set_eot(devinfo, insn, inst->eot);
628    brw_inst_set_header_present(devinfo, insn, false);
629 
630    brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
631    brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */
632 
633    /* Note that even though the thread has a URB resource associated with it,
634     * we set the "do not dereference URB" bit, because the URB resource is
635     * managed by the fixed-function unit, so it will free it automatically.
636     */
637    brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
638 
639    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
640 }
641 
642 void
generate_barrier(fs_inst * inst,struct brw_reg src)643 fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
644 {
645    brw_barrier(p, src);
646    brw_WAIT(p);
647 }
648 
649 void
generate_linterp(fs_inst * inst,struct brw_reg dst,struct brw_reg * src)650 fs_generator::generate_linterp(fs_inst *inst,
651 			     struct brw_reg dst, struct brw_reg *src)
652 {
653    /* PLN reads:
654     *                      /   in SIMD16   \
655     *    -----------------------------------
656     *   | src1+0 | src1+1 | src1+2 | src1+3 |
657     *   |-----------------------------------|
658     *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
659     *    -----------------------------------
660     *
661     * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
662     *
663     *    -----------------------------------
664     *   | src1+0 | src1+1 | src1+2 | src1+3 |
665     *   |-----------------------------------|
666     *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
667     *   |-----------------------------------|
668     *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
669     *    -----------------------------------
670     *
671     * See also: emit_interpolation_setup_gen4().
672     */
673    struct brw_reg delta_x = src[0];
674    struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
675    struct brw_reg interp = src[1];
676 
677    if (devinfo->has_pln &&
678        (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
679       brw_PLN(p, dst, interp, delta_x);
680    } else {
681       brw_LINE(p, brw_null_reg(), interp, delta_x);
682       brw_MAC(p, dst, suboffset(interp, 1), delta_y);
683    }
684 }
685 
686 void
generate_get_buffer_size(fs_inst * inst,struct brw_reg dst,struct brw_reg src,struct brw_reg surf_index)687 fs_generator::generate_get_buffer_size(fs_inst *inst,
688                                        struct brw_reg dst,
689                                        struct brw_reg src,
690                                        struct brw_reg surf_index)
691 {
692    assert(devinfo->gen >= 7);
693    assert(surf_index.file == BRW_IMMEDIATE_VALUE);
694 
695    uint32_t simd_mode;
696    int rlen = 4;
697 
698    switch (inst->exec_size) {
699    case 8:
700       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
701       break;
702    case 16:
703       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
704       break;
705    default:
706       unreachable("Invalid width for texture instruction");
707    }
708 
709    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
710       rlen = 8;
711       dst = vec16(dst);
712    }
713 
714    brw_SAMPLE(p,
715               retype(dst, BRW_REGISTER_TYPE_UW),
716               inst->base_mrf,
717               src,
718               surf_index.ud,
719               0,
720               GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
721               rlen, /* response length */
722               inst->mlen,
723               inst->header_size > 0,
724               simd_mode,
725               BRW_SAMPLER_RETURN_FORMAT_SINT32);
726 
727    brw_mark_surface_used(prog_data, surf_index.ud);
728 }
729 
/**
 * Emit the sampler SEND for a texturing instruction.
 *
 * The sampler return format is picked from dst.type (and forced to UINT32
 * for SHADER_OPCODE_TXS), the SIMD mode from inst->exec_size, and the sampler
 * message type from inst->opcode, using one table for gen >= 5 and another
 * for older hardware.  On gen < 7 a message header, if present, is set up
 * before the send.
 *
 * When both \p surface_index and \p sampler_index are immediates the message
 * is emitted through brw_SAMPLE(); otherwise the combined index is computed
 * into a0.0 and an indirect send is emitted.  If inst->eot is set, the last
 * emitted instruction is turned into a SENDC with the EOT bit set.
 */
void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                           struct brw_reg surface_index,
                           struct brw_reg sampler_index)
{
   assert(inst->size_written % REG_SIZE == 0);
   int msg_type = -1;
   uint32_t simd_mode;
   uint32_t return_format;
   bool is_combined_send = inst->eot;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
    * is set as part of the message descriptor.  On gen4, the PRM seems to
    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
    * gone from the message descriptor entirely and you just get UINT32 all
    * the time regardless.  Since we can really only do non-UINT32 on gen4,
    * just stomp it to UINT32 all the time.
    */
   if (inst->opcode == SHADER_OPCODE_TXS)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   /* Map the IR opcode (and shadow_compare) to a hardware message type. */
   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
	 }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
	 }
	 break;
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
	 }
	 break;
      case SHADER_OPCODE_TXL_LZ:
         assert(devinfo->gen >= 9);
	 if (inst->shadow_compare) {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ;
         } else {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
         }
         break;
      case SHADER_OPCODE_TXS:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
	 break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered in NIR */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
	 break;
      case SHADER_OPCODE_TXF:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
	 break;
      case SHADER_OPCODE_TXF_LZ:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
         break;
      case SHADER_OPCODE_TXF_CMS_W:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_UMS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            assert(devinfo->gen >= 7);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            assert(devinfo->gen >= 6);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         assert(devinfo->gen >= 7);
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
	 unreachable("not reached");
      }
   } else {
      /* Several cases below pick a SIMD16-only message and therefore
       * override simd_mode to SIMD16 regardless of inst->exec_size.
       */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
	 /* Note that G45 and older determines shadow compare and dispatch width
	  * from message length for most messages.
	  */
         if (inst->exec_size == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
	 } else {
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
	 } else {
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case SHADER_OPCODE_TXD:
	 /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->exec_size == 8);
	 assert(inst->mlen == 7 || inst->mlen == 10);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
	 break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 break;
      case SHADER_OPCODE_TXS:
	 assert(inst->mlen == 3);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 break;
      default:
	 unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      dst = vec16(dst);
   }

   assert(devinfo->gen < 7 || inst->header_size == 0 ||
          src.file == BRW_GENERAL_REGISTER_FILE);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->header_size != 0 && devinfo->gen < 7) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         assert(inst->base_mrf != -1);
         struct brw_reg header_reg = brw_message_reg(inst->base_mrf);

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                       brw_imm_ud(inst->offset));
         }

         brw_pop_insn_state(p);
      }
   }

   /* The TG4 opcodes index from gather_texture_start in the binding table;
    * every other opcode indexes from texture_start.
    */
   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->binding_table.gather_texture_start
         : prog_data->binding_table.texture_start;

   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
       sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t surface = surface_index.ud;
      uint32_t sampler = sampler_index.ud;

      brw_SAMPLE(p,
                 retype(dst, BRW_REGISTER_TYPE_UW),
                 inst->base_mrf,
                 src,
                 surface + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 inst->size_written / REG_SIZE,
                 inst->mlen,
                 inst->header_size != 0,
                 simd_mode,
                 return_format);

      brw_mark_surface_used(prog_data, surface + base_binding_table_index);
   } else {
      /* Non-const surface and/or sampler index: build the combined index
       * in a0.0 and use an indirect send.
       */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);

      /* addr = (sampler << 8) | surface.  When both indices come from the
       * same register a single multiply by 0x101 yields (x << 8) + x.
       */
      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
      } else {
         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
         } else {
            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
            brw_OR(p, addr, addr, surface_reg);
         }
      }
      if (base_binding_table_index)
         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
      /* Keep only the low 12 bits of the combined index. */
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              inst->size_written / REG_SIZE,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              simd_mode,
                              return_format);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }

   /* An EOT texture instruction becomes a SENDC carrying the EOT bit. */
   if (is_combined_send) {
      brw_inst_set_eot(p->devinfo, brw_last_inst, true);
      brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
   }
}
1050 
1051 
1052 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
1053  * looking like:
1054  *
1055  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
1056  *
1057  * Ideally, we want to produce:
1058  *
1059  *           DDX                     DDY
1060  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
1061  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
1062  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
1063  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
1064  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
1065  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
1066  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
1067  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
1068  *
1069  * and add another set of two more subspans if in 16-pixel dispatch mode.
1070  *
1071  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
1072  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
1073  * pair.  But the ideal approximation may impose a huge performance cost on
1074  * sample_d.  On at least Haswell, sample_d instruction does some
1075  * optimizations if the same LOD is used for all pixels in the subspan.
1076  *
1077  * For DDY, we need to use ALIGN16 mode since it's capable of doing the
1078  * appropriate swizzling.
1079  */
1080 void
generate_ddx(enum opcode opcode,struct brw_reg dst,struct brw_reg src)1081 fs_generator::generate_ddx(enum opcode opcode,
1082                            struct brw_reg dst, struct brw_reg src)
1083 {
1084    unsigned vstride, width;
1085 
1086    if (opcode == FS_OPCODE_DDX_FINE) {
1087       /* produce accurate derivatives */
1088       vstride = BRW_VERTICAL_STRIDE_2;
1089       width = BRW_WIDTH_2;
1090    } else {
1091       /* replicate the derivative at the top-left pixel to other pixels */
1092       vstride = BRW_VERTICAL_STRIDE_4;
1093       width = BRW_WIDTH_4;
1094    }
1095 
1096    struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
1097                                  src.negate, src.abs,
1098 				 BRW_REGISTER_TYPE_F,
1099 				 vstride,
1100 				 width,
1101 				 BRW_HORIZONTAL_STRIDE_0,
1102 				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1103    struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
1104                                  src.negate, src.abs,
1105 				 BRW_REGISTER_TYPE_F,
1106 				 vstride,
1107 				 width,
1108 				 BRW_HORIZONTAL_STRIDE_0,
1109 				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1110    brw_ADD(p, dst, src0, negate(src1));
1111 }
1112 
1113 /* The negate_value boolean is used to negate the derivative computation for
1114  * FBOs, since they place the origin at the upper left instead of the lower
1115  * left.
1116  */
1117 void
generate_ddy(enum opcode opcode,struct brw_reg dst,struct brw_reg src)1118 fs_generator::generate_ddy(enum opcode opcode,
1119                            struct brw_reg dst, struct brw_reg src)
1120 {
1121    if (opcode == FS_OPCODE_DDY_FINE) {
1122       /* produce accurate derivatives */
1123       struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
1124                                     src.negate, src.abs,
1125                                     BRW_REGISTER_TYPE_F,
1126                                     BRW_VERTICAL_STRIDE_4,
1127                                     BRW_WIDTH_4,
1128                                     BRW_HORIZONTAL_STRIDE_1,
1129                                     BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
1130       struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
1131                                     src.negate, src.abs,
1132                                     BRW_REGISTER_TYPE_F,
1133                                     BRW_VERTICAL_STRIDE_4,
1134                                     BRW_WIDTH_4,
1135                                     BRW_HORIZONTAL_STRIDE_1,
1136                                     BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
1137       brw_push_insn_state(p);
1138       brw_set_default_access_mode(p, BRW_ALIGN_16);
1139       brw_ADD(p, dst, negate(src0), src1);
1140       brw_pop_insn_state(p);
1141    } else {
1142       /* replicate the derivative at the top-left pixel to other pixels */
1143       struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
1144                                     src.negate, src.abs,
1145                                     BRW_REGISTER_TYPE_F,
1146                                     BRW_VERTICAL_STRIDE_4,
1147                                     BRW_WIDTH_4,
1148                                     BRW_HORIZONTAL_STRIDE_0,
1149                                     BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1150       struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
1151                                     src.negate, src.abs,
1152                                     BRW_REGISTER_TYPE_F,
1153                                     BRW_VERTICAL_STRIDE_4,
1154                                     BRW_WIDTH_4,
1155                                     BRW_HORIZONTAL_STRIDE_0,
1156                                     BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1157       brw_ADD(p, dst, negate(src0), src1);
1158    }
1159 }
1160 
1161 void
generate_discard_jump(fs_inst * inst)1162 fs_generator::generate_discard_jump(fs_inst *inst)
1163 {
1164    assert(devinfo->gen >= 6);
1165 
1166    /* This HALT will be patched up at FB write time to point UIP at the end of
1167     * the program, and at brw_uip_jip() JIP will be set to the end of the
1168     * current block (or the program).
1169     */
1170    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
1171    gen6_HALT(p);
1172 }
1173 
1174 void
generate_scratch_write(fs_inst * inst,struct brw_reg src)1175 fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
1176 {
1177    /* The 32-wide messages only respect the first 16-wide half of the channel
1178     * enable signals which are replicated identically for the second group of
1179     * 16 channels, so we cannot use them unless the write is marked
1180     * force_writemask_all.
1181     */
1182    const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
1183                                MIN2(16, inst->exec_size);
1184    const unsigned block_size = 4 * lower_size / REG_SIZE;
1185    assert(inst->mlen != 0);
1186 
1187    brw_push_insn_state(p);
1188    brw_set_default_exec_size(p, cvt(lower_size) - 1);
1189    brw_set_default_compression(p, lower_size > 8);
1190 
1191    for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
1192       brw_set_default_group(p, inst->group + lower_size * i);
1193 
1194       brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
1195               retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
1196 
1197       brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
1198                                     block_size,
1199                                     inst->offset + block_size * REG_SIZE * i);
1200    }
1201 
1202    brw_pop_insn_state(p);
1203 }
1204 
1205 void
generate_scratch_read(fs_inst * inst,struct brw_reg dst)1206 fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
1207 {
1208    assert(inst->exec_size <= 16 || inst->force_writemask_all);
1209    assert(inst->mlen != 0);
1210 
1211    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
1212                                 inst->exec_size / 8, inst->offset);
1213 }
1214 
1215 void
generate_scratch_read_gen7(fs_inst * inst,struct brw_reg dst)1216 fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
1217 {
1218    assert(inst->exec_size <= 16 || inst->force_writemask_all);
1219 
1220    gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
1221 }
1222 
1223 void
generate_uniform_pull_constant_load(fs_inst * inst,struct brw_reg dst,struct brw_reg index,struct brw_reg offset)1224 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
1225                                                   struct brw_reg dst,
1226                                                   struct brw_reg index,
1227                                                   struct brw_reg offset)
1228 {
1229    assert(type_sz(dst.type) == 4);
1230    assert(inst->mlen != 0);
1231 
1232    assert(index.file == BRW_IMMEDIATE_VALUE &&
1233 	  index.type == BRW_REGISTER_TYPE_UD);
1234    uint32_t surf_index = index.ud;
1235 
1236    assert(offset.file == BRW_IMMEDIATE_VALUE &&
1237 	  offset.type == BRW_REGISTER_TYPE_UD);
1238    uint32_t read_offset = offset.ud;
1239 
1240    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
1241 			read_offset, surf_index);
1242 }
1243 
1244 void
generate_uniform_pull_constant_load_gen7(fs_inst * inst,struct brw_reg dst,struct brw_reg index,struct brw_reg payload)1245 fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
1246                                                        struct brw_reg dst,
1247                                                        struct brw_reg index,
1248                                                        struct brw_reg payload)
1249 {
1250    assert(index.type == BRW_REGISTER_TYPE_UD);
1251    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1252    assert(type_sz(dst.type) == 4);
1253 
1254    if (index.file == BRW_IMMEDIATE_VALUE) {
1255       const uint32_t surf_index = index.ud;
1256 
1257       brw_push_insn_state(p);
1258       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1259       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1260       brw_pop_insn_state(p);
1261 
1262       brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
1263       brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
1264       brw_set_dp_read_message(p, send, surf_index,
1265                               BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
1266                               GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
1267                               GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1268                               1, /* mlen */
1269                               true, /* header */
1270                               DIV_ROUND_UP(inst->size_written, REG_SIZE));
1271 
1272    } else {
1273       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1274 
1275       brw_push_insn_state(p);
1276       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1277 
1278       /* a0.0 = surf_index & 0xff */
1279       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1280       brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
1281       brw_set_dest(p, insn_and, addr);
1282       brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1283       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1284 
1285       /* dst = send(payload, a0.0 | <descriptor>) */
1286       brw_inst *insn = brw_send_indirect_message(
1287          p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1288          retype(dst, BRW_REGISTER_TYPE_UD),
1289          retype(payload, BRW_REGISTER_TYPE_UD), addr);
1290       brw_set_dp_read_message(p, insn, 0 /* surface */,
1291                               BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
1292                               GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
1293                               GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1294                               1, /* mlen */
1295                               true, /* header */
1296                               DIV_ROUND_UP(inst->size_written, REG_SIZE));
1297 
1298       brw_pop_insn_state(p);
1299    }
1300 }
1301 
1302 void
generate_varying_pull_constant_load_gen4(fs_inst * inst,struct brw_reg dst,struct brw_reg index)1303 fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
1304                                                        struct brw_reg dst,
1305                                                        struct brw_reg index)
1306 {
1307    assert(devinfo->gen < 7); /* Should use the gen7 variant. */
1308    assert(inst->header_size != 0);
1309    assert(inst->mlen);
1310 
1311    assert(index.file == BRW_IMMEDIATE_VALUE &&
1312 	  index.type == BRW_REGISTER_TYPE_UD);
1313    uint32_t surf_index = index.ud;
1314 
1315    uint32_t simd_mode, rlen, msg_type;
1316    if (inst->exec_size == 16) {
1317       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1318       rlen = 8;
1319    } else {
1320       assert(inst->exec_size == 8);
1321       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1322       rlen = 4;
1323    }
1324 
1325    if (devinfo->gen >= 5)
1326       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
1327    else {
1328       /* We always use the SIMD16 message so that we only have to load U, and
1329        * not V or R.
1330        */
1331       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
1332       assert(inst->mlen == 3);
1333       assert(inst->size_written == 8 * REG_SIZE);
1334       rlen = 8;
1335       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1336    }
1337 
1338    struct brw_reg header = brw_vec8_grf(0, 0);
1339    gen6_resolve_implied_move(p, &header, inst->base_mrf);
1340 
1341    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1342    brw_inst_set_compression(devinfo, send, false);
1343    brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1344    brw_set_src0(p, send, header);
1345    if (devinfo->gen < 6)
1346       brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);
1347 
1348    /* Our surface is set up as floats, regardless of what actual data is
1349     * stored in it.
1350     */
1351    uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
1352    brw_set_sampler_message(p, send,
1353                            surf_index,
1354                            0, /* sampler (unused) */
1355                            msg_type,
1356                            rlen,
1357                            inst->mlen,
1358                            inst->header_size != 0,
1359                            simd_mode,
1360                            return_format);
1361 }
1362 
1363 void
generate_varying_pull_constant_load_gen7(fs_inst * inst,struct brw_reg dst,struct brw_reg index,struct brw_reg offset)1364 fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
1365                                                        struct brw_reg dst,
1366                                                        struct brw_reg index,
1367                                                        struct brw_reg offset)
1368 {
1369    assert(devinfo->gen >= 7);
1370    /* Varying-offset pull constant loads are treated as a normal expression on
1371     * gen7, so the fact that it's a send message is hidden at the IR level.
1372     */
1373    assert(inst->header_size == 0);
1374    assert(!inst->mlen);
1375    assert(index.type == BRW_REGISTER_TYPE_UD);
1376 
1377    uint32_t simd_mode, rlen, mlen;
1378    if (inst->exec_size == 16) {
1379       mlen = 2;
1380       rlen = 8;
1381       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1382    } else {
1383       assert(inst->exec_size == 8);
1384       mlen = 1;
1385       rlen = 4;
1386       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1387    }
1388 
1389    if (index.file == BRW_IMMEDIATE_VALUE) {
1390 
1391       uint32_t surf_index = index.ud;
1392 
1393       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1394       brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1395       brw_set_src0(p, send, offset);
1396       brw_set_sampler_message(p, send,
1397                               surf_index,
1398                               0, /* LD message ignores sampler unit */
1399                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1400                               rlen,
1401                               mlen,
1402                               false, /* no header */
1403                               simd_mode,
1404                               0);
1405 
1406    } else {
1407 
1408       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1409 
1410       brw_push_insn_state(p);
1411       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1412 
1413       /* a0.0 = surf_index & 0xff */
1414       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1415       brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
1416       brw_set_dest(p, insn_and, addr);
1417       brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1418       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1419 
1420       brw_pop_insn_state(p);
1421 
1422       /* dst = send(offset, a0.0 | <descriptor>) */
1423       brw_inst *insn = brw_send_indirect_message(
1424          p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
1425          offset, addr);
1426       brw_set_sampler_message(p, insn,
1427                               0 /* surface */,
1428                               0 /* sampler */,
1429                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1430                               rlen /* rlen */,
1431                               mlen /* mlen */,
1432                               false /* header */,
1433                               simd_mode,
1434                               0);
1435    }
1436 }
1437 
1438 /**
1439  * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
1440  * into the flags register (f0.0).
1441  *
1442  * Used only on Gen6 and above.
1443  */
1444 void
generate_mov_dispatch_to_flags(fs_inst * inst)1445 fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
1446 {
1447    struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
1448    struct brw_reg dispatch_mask;
1449 
1450    if (devinfo->gen >= 6)
1451       dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1452    else
1453       dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1454 
1455    brw_push_insn_state(p);
1456    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1457    brw_set_default_exec_size(p, BRW_EXECUTE_1);
1458    brw_MOV(p, flags, dispatch_mask);
1459    brw_pop_insn_state(p);
1460 }
1461 
1462 void
generate_pixel_interpolator_query(fs_inst * inst,struct brw_reg dst,struct brw_reg src,struct brw_reg msg_data,unsigned msg_type)1463 fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
1464                                                 struct brw_reg dst,
1465                                                 struct brw_reg src,
1466                                                 struct brw_reg msg_data,
1467                                                 unsigned msg_type)
1468 {
1469    assert(inst->size_written % REG_SIZE == 0);
1470    assert(msg_data.type == BRW_REGISTER_TYPE_UD);
1471 
1472    brw_pixel_interpolator_query(p,
1473          retype(dst, BRW_REGISTER_TYPE_UW),
1474          src,
1475          inst->pi_noperspective,
1476          msg_type,
1477          msg_data,
1478          inst->mlen,
1479          inst->size_written / REG_SIZE);
1480 }
1481 
1482 /* Sets vstride=1, width=4, hstride=0 of register src1 during
1483  * the ADD instruction.
1484  */
1485 void
generate_set_sample_id(fs_inst * inst,struct brw_reg dst,struct brw_reg src0,struct brw_reg src1)1486 fs_generator::generate_set_sample_id(fs_inst *inst,
1487                                      struct brw_reg dst,
1488                                      struct brw_reg src0,
1489                                      struct brw_reg src1)
1490 {
1491    assert(dst.type == BRW_REGISTER_TYPE_D ||
1492           dst.type == BRW_REGISTER_TYPE_UD);
1493    assert(src0.type == BRW_REGISTER_TYPE_D ||
1494           src0.type == BRW_REGISTER_TYPE_UD);
1495 
1496    struct brw_reg reg = stride(src1, 1, 4, 0);
1497    if (devinfo->gen >= 8 || inst->exec_size == 8) {
1498       brw_ADD(p, dst, src0, reg);
1499    } else if (inst->exec_size == 16) {
1500       brw_push_insn_state(p);
1501       brw_set_default_exec_size(p, BRW_EXECUTE_8);
1502       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1503       brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
1504       brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1505       brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
1506       brw_pop_insn_state(p);
1507    }
1508 }
1509 
1510 void
generate_pack_half_2x16_split(fs_inst * inst,struct brw_reg dst,struct brw_reg x,struct brw_reg y)1511 fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
1512                                             struct brw_reg dst,
1513                                             struct brw_reg x,
1514                                             struct brw_reg y)
1515 {
1516    assert(devinfo->gen >= 7);
1517    assert(dst.type == BRW_REGISTER_TYPE_UD);
1518    assert(x.type == BRW_REGISTER_TYPE_F);
1519    assert(y.type == BRW_REGISTER_TYPE_F);
1520 
1521    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
1522     *
1523     *   Because this instruction does not have a 16-bit floating-point type,
1524     *   the destination data type must be Word (W).
1525     *
1526     *   The destination must be DWord-aligned and specify a horizontal stride
1527     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
1528     *   each destination channel and the upper word is not modified.
1529     */
1530    struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
1531 
1532    /* Give each 32-bit channel of dst the form below, where "." means
1533     * unchanged.
1534     *   0x....hhhh
1535     */
1536    brw_F32TO16(p, dst_w, y);
1537 
1538    /* Now the form:
1539     *   0xhhhh0000
1540     */
1541    brw_SHL(p, dst, dst, brw_imm_ud(16u));
1542 
1543    /* And, finally the form of packHalf2x16's output:
1544     *   0xhhhhllll
1545     */
1546    brw_F32TO16(p, dst_w, x);
1547 }
1548 
1549 void
generate_unpack_half_2x16_split(fs_inst * inst,struct brw_reg dst,struct brw_reg src)1550 fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
1551                                               struct brw_reg dst,
1552                                               struct brw_reg src)
1553 {
1554    assert(devinfo->gen >= 7);
1555    assert(dst.type == BRW_REGISTER_TYPE_F);
1556    assert(src.type == BRW_REGISTER_TYPE_UD);
1557 
1558    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1559     *
1560     *   Because this instruction does not have a 16-bit floating-point type,
1561     *   the source data type must be Word (W). The destination type must be
1562     *   F (Float).
1563     */
1564    struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1565 
1566    /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
1567     * For the Y case, we wish to access only the upper word; therefore
1568     * a 16-bit subregister offset is needed.
1569     */
1570    assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
1571           inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
1572    if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
1573       src_w.subnr += 2;
1574 
1575    brw_F16TO32(p, dst, src_w);
1576 }
1577 
1578 void
generate_shader_time_add(fs_inst * inst,struct brw_reg payload,struct brw_reg offset,struct brw_reg value)1579 fs_generator::generate_shader_time_add(fs_inst *inst,
1580                                        struct brw_reg payload,
1581                                        struct brw_reg offset,
1582                                        struct brw_reg value)
1583 {
1584    assert(devinfo->gen >= 7);
1585    brw_push_insn_state(p);
1586    brw_set_default_mask_control(p, true);
1587 
1588    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1589    struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
1590                                           offset.type);
1591    struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
1592                                          value.type);
1593 
1594    assert(offset.file == BRW_IMMEDIATE_VALUE);
1595    if (value.file == BRW_GENERAL_REGISTER_FILE) {
1596       value.width = BRW_WIDTH_1;
1597       value.hstride = BRW_HORIZONTAL_STRIDE_0;
1598       value.vstride = BRW_VERTICAL_STRIDE_0;
1599    } else {
1600       assert(value.file == BRW_IMMEDIATE_VALUE);
1601    }
1602 
1603    /* Trying to deal with setup of the params from the IR is crazy in the FS8
1604     * case, and we don't really care about squeezing every bit of performance
1605     * out of this path, so we just emit the MOVs from here.
1606     */
1607    brw_MOV(p, payload_offset, offset);
1608    brw_MOV(p, payload_value, value);
1609    brw_shader_time_add(p, payload,
1610                        prog_data->binding_table.shader_time_start);
1611    brw_pop_insn_state(p);
1612 
1613    brw_mark_surface_used(prog_data,
1614                          prog_data->binding_table.shader_time_start);
1615 }
1616 
1617 void
enable_debug(const char * shader_name)1618 fs_generator::enable_debug(const char *shader_name)
1619 {
1620    debug_flag = true;
1621    this->shader_name = shader_name;
1622 }
1623 
/**
 * Emits native instructions for every fs_inst of \p cfg into the
 * instruction store of p.
 *
 * The store is first padded with NOPs up to a 64-byte boundary.  Then, for
 * each IR instruction, the default instruction state (compression, group,
 * access mode, predicate, flag register, saturate, mask control,
 * accumulator write and execution size) is loaded from the IR fields and the
 * opcode is dispatched to the matching brw_* emitter or generate_* helper.
 * After the loop, brw_set_uip_jip() is run over the emitted range, the
 * instructions are passed to brw_validate_instructions() and
 * brw_compact_instructions(), and statistics are reported through
 * compiler->shader_debug_log() (plus stderr and a disassembly dump when
 * debug_flag is set).
 *
 * @param cfg             control flow graph whose instructions are emitted;
 *                        its cycle_count is included in the statistics.
 * @param dispatch_width  stored in this->dispatch_width and reported as the
 *                        SIMD width in the statistics.
 * @return the value of p->next_insn_offset right after the alignment NOPs,
 *         i.e. the offset at which this program starts.
 */
int
fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
{
   /* align to 64 byte boundary. */
   while (p->next_insn_offset % 64)
      brw_NOP(p);

   this->dispatch_width = dispatch_width;

   int start_offset = p->next_insn_offset;
   /* Counters that only feed the statistics printed at the end. */
   int spill_count = 0, fill_count = 0;
   int loop_count = 0;

   struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);

   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      struct brw_reg src[3], dst;
      /* Offset of the first native instruction emitted for this IR
       * instruction; used after the switch to patch that instruction.
       */
      unsigned int last_insn_offset = p->next_insn_offset;
      /* NOTE(review): nothing in this function assigns true to this flag, so
       * the `continue` after the switch is never taken as the code stands.
       */
      bool multiple_instructions_emitted = false;

      /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
       * "Register Region Restrictions" section: for BDW, SKL:
       *
       *    "A POW/FDIV operation must not be followed by an instruction
       *     that requires two destination registers."
       *
       * The documentation is often lacking annotations for Atom parts,
       * and empirically this affects CHV as well.
       */
      if (devinfo->gen >= 8 &&
          devinfo->gen <= 9 &&
          p->nr_insn > 1 &&
          brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
          brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
          inst->dst.component_size(inst->exec_size) > REG_SIZE) {
         brw_NOP(p);
         /* Step past the NOP so the post-switch fixup targets the
          * instruction emitted for this IR instruction, not the NOP.
          */
         last_insn_offset = p->next_insn_offset;
      }

      if (unlikely(debug_flag))
         disasm_annotate(disasm_info, inst, p->next_insn_offset);

      /* If the instruction writes to more than one register, it needs to be
       * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
       * hardware figures out by itself what the right compression mode is,
       * but we still need to know whether the instruction is compressed to
       * set up the source register regions appropriately.
       *
       * XXX - This is wrong for instructions that write a single register but
       *       read more than one which should strictly speaking be treated as
       *       compressed.  For instructions that don't write any registers it
       *       relies on the destination being a null register of the correct
       *       type and regioning so the instruction is considered compressed
       *       or not accordingly.
       */
      const bool compressed =
           inst->dst.component_size(inst->exec_size) > REG_SIZE;
      brw_set_default_compression(p, compressed);
      brw_set_default_group(p, inst->group);

      /* Translate the IR operands into hardware register descriptions. */
      for (unsigned int i = 0; i < inst->sources; i++) {
         src[i] = brw_reg_from_fs_reg(devinfo, inst,
                                      &inst->src[i], compressed);
	 /* The accumulator result appears to get used for the
	  * conditional modifier generation.  When negating a UD
	  * value, there is a 33rd bit generated for the sign in the
	  * accumulator value, so now you can't check, for example,
	  * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
	  */
	 assert(!inst->conditional_mod ||
		inst->src[i].type != BRW_REGISTER_TYPE_UD ||
		!inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(devinfo, inst,
                                &inst->dst, compressed);

      /* Load the per-instruction defaults; individual cases below may
       * override some of them (e.g. the access mode for 3-source opcodes).
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_predicate_control(p, inst->predicate);
      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
      brw_set_default_flag_reg(p, 0, inst->flag_subreg);
      brw_set_default_saturate(p, inst->saturate);
      brw_set_default_mask_control(p, inst->force_writemask_all);
      brw_set_default_acc_write_control(p, inst->writes_accumulator);

      /* On gen7 other than Haswell the execution size is doubled when the
       * execution type or the destination type is 8 bytes wide.
       * NOTE(review): presumably because that hardware counts 64-bit
       * channels as pairs of 32-bit ones -- confirm against the PRM.
       */
      unsigned exec_size = inst->exec_size;
      if (devinfo->gen == 7 && !devinfo->is_haswell &&
          (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) {
         exec_size *= 2;
      }

      brw_set_default_exec_size(p, cvt(exec_size) - 1);

      /* Sanity checks on the IR instruction before emitting anything. */
      assert(inst->force_writemask_all || inst->exec_size >= 4);
      assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
	 brw_MOV(p, dst, src[0]);
	 break;
      case BRW_OPCODE_ADD:
	 brw_ADD(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_MUL:
	 brw_MUL(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_AVG:
	 brw_AVG(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_MACH:
	 brw_MACH(p, dst, src[0], src[1]);
	 break;

      case BRW_OPCODE_LINE:
         brw_LINE(p, dst, src[0], src[1]);
         break;

      /* The 3-source opcodes (MAD, LRP, BFE, BFI2) switch to Align16 access
       * mode on hardware older than gen10.
       */
      case BRW_OPCODE_MAD:
         assert(devinfo->gen >= 6);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_MAD(p, dst, src[0], src[1], src[2]);
	 break;

      case BRW_OPCODE_LRP:
         assert(devinfo->gen >= 6);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_LRP(p, dst, src[0], src[1], src[2]);
	 break;

      case BRW_OPCODE_FRC:
	 brw_FRC(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDD:
	 brw_RNDD(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDE:
	 brw_RNDE(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDZ:
	 brw_RNDZ(p, dst, src[0]);
	 break;

      case BRW_OPCODE_AND:
	 brw_AND(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_OR:
	 brw_OR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_XOR:
	 brw_XOR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_NOT:
	 brw_NOT(p, dst, src[0]);
	 break;
      case BRW_OPCODE_ASR:
	 brw_ASR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_SHR:
	 brw_SHR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_SHL:
	 brw_SHL(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_F32TO16:
         assert(devinfo->gen >= 7);
         brw_F32TO16(p, dst, src[0]);
         break;
      case BRW_OPCODE_F16TO32:
         assert(devinfo->gen >= 7);
         brw_F16TO32(p, dst, src[0]);
         break;
      case BRW_OPCODE_CMP:
         if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell &&
             dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
             * implemented in the compiler is not sufficient. Overriding the
             * type when the destination is the null register is necessary but
             * not sufficient by itself.
             */
            assert(dst.nr == BRW_ARF_NULL);
            dst.type = BRW_REGISTER_TYPE_D;
         }
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
	 break;
      case BRW_OPCODE_SEL:
	 brw_SEL(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_BFREV:
         assert(devinfo->gen >= 7);
         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                   retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_FBH:
         assert(devinfo->gen >= 7);
         brw_FBH(p, retype(dst, src[0].type), src[0]);
         break;
      case BRW_OPCODE_FBL:
         assert(devinfo->gen >= 7);
         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
                 retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_LZD:
         brw_LZD(p, dst, src[0]);
         break;
      case BRW_OPCODE_CBIT:
         assert(devinfo->gen >= 7);
         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
                  retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_ADDC:
         assert(devinfo->gen >= 7);
         brw_ADDC(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SUBB:
         assert(devinfo->gen >= 7);
         brw_SUBB(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MAC:
         brw_MAC(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_BFE:
         assert(devinfo->gen >= 7);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_BFE(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_BFI1:
         assert(devinfo->gen >= 7);
         brw_BFI1(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFI2:
         assert(devinfo->gen >= 7);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_BFI2(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_IF:
	 if (inst->src[0].file != BAD_FILE) {
	    /* The instruction has an embedded compare (only allowed on gen6) */
	    assert(devinfo->gen == 6);
	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
	 } else {
	    brw_IF(p, brw_inst_exec_size(devinfo, p->current));
	 }
	 break;

      case BRW_OPCODE_ELSE:
	 brw_ELSE(p);
	 break;
      case BRW_OPCODE_ENDIF:
	 brw_ENDIF(p);
	 break;

      case BRW_OPCODE_DO:
	 brw_DO(p, brw_inst_exec_size(devinfo, p->current));
	 break;

      case BRW_OPCODE_BREAK:
	 brw_BREAK(p);
	 break;
      case BRW_OPCODE_CONTINUE:
         brw_CONT(p);
	 break;

      case BRW_OPCODE_WHILE:
	 brw_WHILE(p);
         loop_count++;
	 break;

      /* Single-operand math: gen6+ uses gen6_math() with a null second
       * source and no message (mlen == 0); older hardware goes through
       * gen4_math() with a message starting at base_mrf.
       */
      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
	 if (devinfo->gen >= 6) {
            assert(inst->mlen == 0);
            assert(devinfo->gen >= 7 || inst->exec_size == 8);
            gen6_math(p, dst, brw_math_function(inst->opcode),
                      src[0], brw_null_reg());
	 } else {
            assert(inst->mlen >= 1);
            assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8);
            gen4_math(p, dst,
                      brw_math_function(inst->opcode),
                      inst->base_mrf, src[0],
                      BRW_MATH_PRECISION_FULL);
	 }
	 break;
      /* Two-operand math.  Note that on the pre-gen6 path only src[0] is
       * passed to gen4_math(); NOTE(review): the second operand is
       * presumably supplied through the message registers -- confirm.
       */
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 6) {
            assert(inst->mlen == 0);
            assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
                   inst->exec_size == 8);
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
         } else {
            assert(inst->mlen >= 1);
            assert(inst->exec_size == 8);
            gen4_math(p, dst, brw_math_function(inst->opcode),
                      inst->base_mrf, src[0],
                      BRW_MATH_PRECISION_FULL);
	 }
	 break;
      case FS_OPCODE_CINTERP:
	 brw_MOV(p, dst, src[0]);
	 break;
      case FS_OPCODE_LINTERP:
	 generate_linterp(inst, dst, src);
	 break;
      /* PIXEL_X and PIXEL_Y read the same UW source with the same region
       * and differ only in the sub-register offset: element 0 for X,
       * element 4 for Y.
       */
      case FS_OPCODE_PIXEL_X:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 0 * type_sz(src[0].type);
         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         break;
      case FS_OPCODE_PIXEL_Y:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 4 * type_sz(src[0].type);
         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         break;
      case SHADER_OPCODE_GET_BUFFER_SIZE:
         generate_get_buffer_size(inst, dst, src[0], src[1]);
         break;
      case SHADER_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_LZ:
      case SHADER_OPCODE_TXF_CMS:
      case SHADER_OPCODE_TXF_CMS_W:
      case SHADER_OPCODE_TXF_UMS:
      case SHADER_OPCODE_TXF_MCS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXL_LZ:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_LOD:
      case SHADER_OPCODE_TG4:
      case SHADER_OPCODE_TG4_OFFSET:
      case SHADER_OPCODE_SAMPLEINFO:
	 generate_tex(inst, dst, src[0], src[1], src[2]);
	 break;
      case FS_OPCODE_DDX_COARSE:
      case FS_OPCODE_DDX_FINE:
         generate_ddx(inst->opcode, dst, src[0]);
         break;
      case FS_OPCODE_DDY_COARSE:
      case FS_OPCODE_DDY_FINE:
         generate_ddy(inst->opcode, dst, src[0]);
	 break;

      /* Scratch writes are counted as spills and scratch reads as fills
       * for the statistics.
       */
      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
	 generate_scratch_write(inst, src[0]);
         spill_count++;
	 break;

      case SHADER_OPCODE_GEN4_SCRATCH_READ:
	 generate_scratch_read(inst, dst);
         fill_count++;
	 break;

      case SHADER_OPCODE_GEN7_SCRATCH_READ:
	 generate_scratch_read_gen7(inst, dst);
         fill_count++;
	 break;

      case SHADER_OPCODE_MOV_INDIRECT:
         generate_mov_indirect(inst, dst, src[0], src[1]);
         break;

      case SHADER_OPCODE_URB_READ_SIMD8:
      case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
         generate_urb_read(inst, dst, src[0]);
         break;

      case SHADER_OPCODE_URB_WRITE_SIMD8:
      case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
	 generate_urb_write(inst, src[0]);
	 break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
         assert(inst->force_writemask_all);
	 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
	 break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
         assert(inst->force_writemask_all);
	 generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
	 break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
	 generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
	 break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
	 generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
	 break;

      case FS_OPCODE_REP_FB_WRITE:
      case FS_OPCODE_FB_WRITE:
	 generate_fb_write(inst, src[0]);
	 break;

      case FS_OPCODE_FB_READ:
         generate_fb_read(inst, dst, src[0]);
         break;

      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
         generate_mov_dispatch_to_flags(inst);
         break;

      case FS_OPCODE_DISCARD_JUMP:
         generate_discard_jump(inst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         generate_shader_time_add(inst, src[0], src[1], src[2]);
         break;

      /* For the surface access opcodes below, src[2] must be an immediate;
       * its UD value is forwarded to the emitter.
       */
      case SHADER_OPCODE_UNTYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud,
                            inst->mlen, !inst->dst.is_null());
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_read(p, dst, src[0], src[1],
                                  inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_write(p, src[0], src[1],
                                   inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_BYTE_SCATTERED_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_byte_scattered_read(p, dst, src[0], src[1],
                                 inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_byte_scattered_write(p, src[0], src[1],
                                  inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_atomic(p, dst, src[0], src[1],
                          src[2].ud, inst->mlen, !inst->dst.is_null());
         break;

      case SHADER_OPCODE_TYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_read(p, dst, src[0], src[1],
                                inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_MEMORY_FENCE:
         brw_memory_fence(p, dst);
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
         /* Mask selection: all-ones when the stage has packed dispatch,
          * otherwise the vmask register for fragment shaders and the dmask
          * register for every other stage.
          */
         const struct brw_reg mask =
            brw_stage_has_packed_dispatch(devinfo, stage,
                                          prog_data) ? brw_imm_ud(~0u) :
            stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
            brw_dmask_reg();
         brw_find_live_channel(p, dst, mask);
         break;
      }

      case SHADER_OPCODE_BROADCAST:
         assert(inst->force_writemask_all);
         brw_broadcast(p, dst, src[0], src[1]);
         break;

      case FS_OPCODE_SET_SAMPLE_ID:
         generate_set_sample_id(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
          generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
          break;

      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
         generate_unpack_half_2x16_split(inst, dst, src[0]);
         break;

      case FS_OPCODE_PLACEHOLDER_HALT:
         /* This is the place where the final HALT needs to be inserted if
          * we've emitted any discards.  If not, this will emit no code.
          */
         if (!patch_discard_jumps_to_fb_writes()) {
            if (unlikely(debug_flag)) {
               disasm_info->use_tail = true;
            }
         }
         break;

      /* The three interpolate-at opcodes share one helper and differ only
       * in the message type passed as the last argument.
       */
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
         break;

      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
         break;

      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
         break;

      case CS_OPCODE_CS_TERMINATE:
         generate_cs_terminate(inst, src[0]);
         break;

      case SHADER_OPCODE_BARRIER:
	 generate_barrier(inst, src[0]);
	 break;

      case BRW_OPCODE_DIM:
         assert(devinfo->is_haswell);
         assert(src[0].type == BRW_REGISTER_TYPE_DF);
         assert(dst.type == BRW_REGISTER_TYPE_DF);
         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
         break;

      case SHADER_OPCODE_RND_MODE:
         assert(src[0].file == BRW_IMMEDIATE_VALUE);
         brw_rounding_mode(p, (brw_rnd_mode) src[0].d);
         break;

      default:
         unreachable("Unsupported opcode");

      case SHADER_OPCODE_LOAD_PAYLOAD:
         unreachable("Should be lowered by lower_load_payload()");
      }

      if (multiple_instructions_emitted)
         continue;

      /* The conditional modifier and the dependency-control bits are
       * patched into the native instruction at last_insn_offset.  The
       * assert requires that exactly one 16-byte instruction was emitted
       * for this IR instruction when any of these fields is set.
       */
      if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
         assert(p->next_insn_offset == last_insn_offset + 16 ||
                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
                 "emitting more than 1 instruction");

         brw_inst *last = &p->store[last_insn_offset / 16];

         if (inst->conditional_mod)
            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
      }
   }

   brw_set_uip_jip(p, start_offset);

   /* end of program sentinel */
   disasm_new_inst_group(disasm_info, p->next_insn_offset);

   /* The preprocessor selects the prefix of a single statement: in builds
    * with assertions the validator always runs and its result is kept in
    * `validated`; with NDEBUG it runs only when debug_flag is set and the
    * result is discarded.
    */
#ifndef NDEBUG
   bool validated =
#else
   if (unlikely(debug_flag))
#endif
      brw_validate_instructions(devinfo, p->store,
                                start_offset,
                                p->next_insn_offset,
                                disasm_info);

   /* Program size in bytes before and after instruction compaction. */
   int before_size = p->next_insn_offset - start_offset;
   brw_compact_instructions(p, start_offset, disasm_info);
   int after_size = p->next_insn_offset - start_offset;

   if (unlikely(debug_flag)) {
      /* before_size / 16 is reported as the instruction count. */
      fprintf(stderr, "Native code for %s\n"
              "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
              " bytes (%.0f%%)\n",
              shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
              spill_count, fill_count, promoted_constants, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);

      dump_assembly(p->store, disasm_info);
   }
   ralloc_free(disasm_info);
   /* Checked only after the optional dump above; NOTE(review): presumably so
    * that the offending assembly is printed before the assertion fires.
    * `validated` exists only in builds where assert() is active.
    */
   assert(validated);

   compiler->shader_debug_log(log_data,
                              "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
                              "%d:%d spills:fills, Promoted %u constants, "
                              "compacted %d to %d bytes.",
                              _mesa_shader_stage_to_abbrev(stage),
                              dispatch_width, before_size / 16,
                              loop_count, cfg->cycle_count, spill_count,
                              fill_count, promoted_constants, before_size,
                              after_size);

   return start_offset;
}
2247 
2248 const unsigned *
get_assembly(unsigned int * assembly_size)2249 fs_generator::get_assembly(unsigned int *assembly_size)
2250 {
2251    return brw_get_program(p, assembly_size);
2252 }
2253