1 /*
2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4  develop this 3D driver.
5 
6  Permission is hereby granted, free of charge, to any person obtaining
7  a copy of this software and associated documentation files (the
8  "Software"), to deal in the Software without restriction, including
9  without limitation the rights to use, copy, modify, merge, publish,
10  distribute, sublicense, and/or sell copies of the Software, and to
11  permit persons to whom the Software is furnished to do so, subject to
12  the following conditions:
13 
14  The above copyright notice and this permission notice (including the
15  next paragraph) shall be included in all copies or substantial
16  portions of the Software.
17 
18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 
26  **********************************************************************/
27  /*
28   * Authors:
29   *   Keith Whitwell <keith@tungstengraphics.com>
30   */
31 
32 #include <string.h>
33 
34 #include "brw_context.h"
35 #include "brw_defines.h"
36 #include "brw_eu.h"
37 
38 #include "ralloc.h"
39 
40 /***********************************************************************
41  * Internal helper for constructing instructions
42  */
43 
guess_execution_size(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg reg)44 static void guess_execution_size(struct brw_compile *p,
45 				 struct brw_instruction *insn,
46 				 struct brw_reg reg)
47 {
48    if (reg.width == BRW_WIDTH_8 && p->compressed)
49       insn->header.execution_size = BRW_EXECUTE_16;
50    else
51       insn->header.execution_size = reg.width;	/* note - definitions are compatible */
52 }
53 
54 
55 /**
56  * Prior to Sandybridge, the SEND instruction accepted non-MRF source
57  * registers, implicitly moving the operand to a message register.
58  *
59  * On Sandybridge, this is no longer the case.  This function performs the
60  * explicit move; it should be called before emitting a SEND instruction.
61  */
62 void
gen6_resolve_implied_move(struct brw_compile * p,struct brw_reg * src,unsigned msg_reg_nr)63 gen6_resolve_implied_move(struct brw_compile *p,
64 			  struct brw_reg *src,
65 			  unsigned msg_reg_nr)
66 {
67    struct intel_context *intel = &p->brw->intel;
68    if (intel->gen < 6)
69       return;
70 
71    if (src->file == BRW_MESSAGE_REGISTER_FILE)
72       return;
73 
74    if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
75       brw_push_insn_state(p);
76       brw_set_mask_control(p, BRW_MASK_DISABLE);
77       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
78       brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
79 	      retype(*src, BRW_REGISTER_TYPE_UD));
80       brw_pop_insn_state(p);
81    }
82    *src = brw_message_reg(msg_reg_nr);
83 }
84 
85 static void
gen7_convert_mrf_to_grf(struct brw_compile * p,struct brw_reg * reg)86 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
87 {
88    /* From the BSpec / ISA Reference / send - [DevIVB+]:
89     * "The send with EOT should use register space R112-R127 for <src>. This is
90     *  to enable loading of a new thread into the same slot while the message
91     *  with EOT for current thread is pending dispatch."
92     *
93     * Since we're pretending to have 16 MRFs anyway, we may as well use the
94     * registers required for messages with EOT.
95     */
96    struct intel_context *intel = &p->brw->intel;
97    if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
98       reg->file = BRW_GENERAL_REGISTER_FILE;
99       reg->nr += GEN7_MRF_HACK_START;
100    }
101 }
102 
103 
104 void
brw_set_dest(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg dest)105 brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
106 	     struct brw_reg dest)
107 {
108    if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
109        dest.file != BRW_MESSAGE_REGISTER_FILE)
110       assert(dest.nr < 128);
111 
112    gen7_convert_mrf_to_grf(p, &dest);
113 
114    insn->bits1.da1.dest_reg_file = dest.file;
115    insn->bits1.da1.dest_reg_type = dest.type;
116    insn->bits1.da1.dest_address_mode = dest.address_mode;
117 
118    if (dest.address_mode == BRW_ADDRESS_DIRECT) {
119       insn->bits1.da1.dest_reg_nr = dest.nr;
120 
121       if (insn->header.access_mode == BRW_ALIGN_1) {
122 	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
123 	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
124 	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
125 	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
126       }
127       else {
128 	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
129 	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
130 	 /* even ignored in da16, still need to set as '01' */
131 	 insn->bits1.da16.dest_horiz_stride = 1;
132       }
133    }
134    else {
135       insn->bits1.ia1.dest_subreg_nr = dest.subnr;
136 
137       /* These are different sizes in align1 vs align16:
138        */
139       if (insn->header.access_mode == BRW_ALIGN_1) {
140 	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
141 	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
142 	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
143 	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
144       }
145       else {
146 	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
147 	 /* even ignored in da16, still need to set as '01' */
148 	 insn->bits1.ia16.dest_horiz_stride = 1;
149       }
150    }
151 
152    /* NEW: Set the execution size based on dest.width and
153     * insn->compression_control:
154     */
155    guess_execution_size(p, insn, dest);
156 }
157 
158 extern int reg_type_size[];
159 
160 static void
validate_reg(struct brw_instruction * insn,struct brw_reg reg)161 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
162 {
163    int hstride_for_reg[] = {0, 1, 2, 4};
164    int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
165    int width_for_reg[] = {1, 2, 4, 8, 16};
166    int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
167    int width, hstride, vstride, execsize;
168 
169    if (reg.file == BRW_IMMEDIATE_VALUE) {
170       /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
171        * mean the destination has to be 128-bit aligned and the
172        * destination horiz stride has to be a word.
173        */
174       if (reg.type == BRW_REGISTER_TYPE_V) {
175 	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
176 		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
177       }
178 
179       return;
180    }
181 
182    if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
183        reg.nr == BRW_ARF_NULL)
184       return;
185 
186    assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
187    hstride = hstride_for_reg[reg.hstride];
188 
189    if (reg.vstride == 0xf) {
190       vstride = -1;
191    } else {
192       assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
193       vstride = vstride_for_reg[reg.vstride];
194    }
195 
196    assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
197    width = width_for_reg[reg.width];
198 
199    assert(insn->header.execution_size >= 0 &&
200 	  insn->header.execution_size < Elements(execsize_for_reg));
201    execsize = execsize_for_reg[insn->header.execution_size];
202 
203    /* Restrictions from 3.3.10: Register Region Restrictions. */
204    /* 3. */
205    assert(execsize >= width);
206 
207    /* FIXME: the assembler has a lot of code written that triggers the
208     * assertions commented it below. Let's paper over it (for now!) until we
209     * can re-validate the shaders with those little inconsistencies fixed. */
210 
211    /* 4. */
212 #if 0
213    if (execsize == width && hstride != 0) {
214       assert(vstride == -1 || vstride == width * hstride);
215    }
216 #endif
217 
218    /* 5. */
219    if (execsize == width && hstride == 0) {
220       /* no restriction on vstride. */
221    }
222 
223    /* 6. */
224 #if 0
225    if (width == 1) {
226       assert(hstride == 0);
227    }
228 #endif
229 
230    /* 7. */
231 #if 0
232    if (execsize == 1 && width == 1) {
233       assert(hstride == 0);
234       assert(vstride == 0);
235    }
236 #endif
237 
238    /* 8. */
239    if (vstride == 0 && hstride == 0) {
240       assert(width == 1);
241    }
242 
243    /* 10. Check destination issues. */
244 }
245 
246 void
brw_set_src0(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg reg)247 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
248 	     struct brw_reg reg)
249 {
250    struct brw_context *brw = p->brw;
251    struct intel_context *intel = &brw->intel;
252 
253    if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
254       assert(reg.nr < 128);
255 
256    gen7_convert_mrf_to_grf(p, &reg);
257 
258    if (intel->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
259                            insn->header.opcode == BRW_OPCODE_SENDC)) {
260       /* Any source modifiers or regions will be ignored, since this just
261        * identifies the MRF/GRF to start reading the message contents from.
262        * Check for some likely failures.
263        */
264       assert(!reg.negate);
265       assert(!reg.abs);
266       assert(reg.address_mode == BRW_ADDRESS_DIRECT);
267    }
268 
269    validate_reg(insn, reg);
270 
271    insn->bits1.da1.src0_reg_file = reg.file;
272    insn->bits1.da1.src0_reg_type = reg.type;
273    insn->bits2.da1.src0_abs = reg.abs;
274    insn->bits2.da1.src0_negate = reg.negate;
275    insn->bits2.da1.src0_address_mode = reg.address_mode;
276 
277    if (reg.file == BRW_IMMEDIATE_VALUE) {
278       insn->bits3.ud = reg.dw1.ud;
279 
280       /* Required to set some fields in src1 as well:
281        */
282 
283       /* FIXME: This looks quite wrong, tempering with src1. I did not find
284        * anything in the bspec that was hinting it woud be needed when setting
285        * src0. before removing this one needs to run piglit.
286 
287       insn->bits1.da1.src1_reg_file = 0;
288       insn->bits1.da1.src1_reg_type = reg.type;
289        */
290    }
291    else
292    {
293       if (reg.address_mode == BRW_ADDRESS_DIRECT) {
294 	 if (insn->header.access_mode == BRW_ALIGN_1) {
295 	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
296 	    insn->bits2.da1.src0_reg_nr = reg.nr;
297 	 }
298 	 else {
299 	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
300 	    insn->bits2.da16.src0_reg_nr = reg.nr;
301 	 }
302       }
303       else {
304 	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
305 
306 	 if (insn->header.access_mode == BRW_ALIGN_1) {
307 	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
308 	 }
309 	 else {
310 	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
311 	 }
312       }
313 
314       if (insn->header.access_mode == BRW_ALIGN_1) {
315 
316 	 /* FIXME: While this is correct, if the assembler uses that code path
317 	  * the opcode generated are different and thus needs a validation
318 	  * pass.
319 	 if (reg.width == BRW_WIDTH_1 &&
320 	     insn->header.execution_size == BRW_EXECUTE_1) {
321 	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
322 	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
323 	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
324 	 }
325 	 else {
326          */
327 	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
328 	    insn->bits2.da1.src0_width = reg.width;
329 	    insn->bits2.da1.src0_vert_stride = reg.vstride;
330      /* } */
331       }
332       else {
333 	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
334 	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
335 	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
336 	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
337 
338 	 /* This is an oddity of the fact we're using the same
339 	  * descriptions for registers in align_16 as align_1:
340 	  */
341 	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
342 	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
343 	 else
344 	    insn->bits2.da16.src0_vert_stride = reg.vstride;
345       }
346    }
347 }
348 
349 
brw_set_src1(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg reg)350 void brw_set_src1(struct brw_compile *p,
351 		  struct brw_instruction *insn,
352 		  struct brw_reg reg)
353 {
354    struct brw_context *brw = p->brw;
355    struct intel_context *intel = &brw->intel;
356 
357    assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
358 
359    if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
360       assert(reg.nr < 128);
361 
362    gen7_convert_mrf_to_grf(p, &reg);
363 
364    validate_reg(insn, reg);
365 
366    insn->bits1.da1.src1_reg_file = reg.file;
367    insn->bits1.da1.src1_reg_type = reg.type;
368    insn->bits3.da1.src1_abs = reg.abs;
369    insn->bits3.da1.src1_negate = reg.negate;
370    insn->bits3.da1.src1_address_mode = reg.address_mode;
371 
372    /* Only src1 can be immediate in two-argument instructions.
373     */
374    assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
375 
376    if (reg.file == BRW_IMMEDIATE_VALUE) {
377       insn->bits3.ud = reg.dw1.ud;
378    }
379    else {
380       /* It's only BRW that does not support register-indirect addressing on
381        * src1 */
382       assert (intel->gen >= 4 || reg.address_mode == BRW_ADDRESS_DIRECT);
383 
384       if (reg.address_mode == BRW_ADDRESS_DIRECT) {
385 	 if (insn->header.access_mode == BRW_ALIGN_1) {
386 	    insn->bits3.da1.src1_subreg_nr = reg.subnr;
387 	    insn->bits3.da1.src1_reg_nr = reg.nr;
388 	 }
389 	 else {
390 	    insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
391 	    insn->bits3.da16.src1_reg_nr = reg.nr;
392 	 }
393       }
394       else {
395 	 insn->bits3.ia1.src1_subreg_nr = reg.subnr;
396 
397 	 if (insn->header.access_mode == BRW_ALIGN_1)
398 	    insn->bits3.ia1.src1_indirect_offset = reg.dw1.bits.indirect_offset;
399 	 else
400 	    insn->bits3.ia16.src1_indirect_offset = reg.dw1.bits.indirect_offset / 16;
401       }
402 
403       if (insn->header.access_mode == BRW_ALIGN_1) {
404 	 /* FIXME: While this is correct, if the assembler uses that code path
405 	  * the opcode generated are different and thus needs a validation
406 	  * pass.
407 	 if (reg.width == BRW_WIDTH_1 &&
408 	     insn->header.execution_size == BRW_EXECUTE_1) {
409 	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
410 	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
411 	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
412 	 }
413 	 else { */
414 	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
415 	    insn->bits3.da1.src1_width = reg.width;
416 	    insn->bits3.da1.src1_vert_stride = reg.vstride;
417      /* } */
418       }
419       else {
420 	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
421 	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
422 	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
423 	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
424 
425 	 /* This is an oddity of the fact we're using the same
426 	  * descriptions for registers in align_16 as align_1:
427 	  */
428 	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
429 	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
430 	 else
431 	    insn->bits3.da16.src1_vert_stride = reg.vstride;
432       }
433    }
434 }
435 
436 /**
437  * Set the Message Descriptor and Extended Message Descriptor fields
438  * for SEND messages.
439  *
440  * \note This zeroes out the Function Control bits, so it must be called
441  *       \b before filling out any message-specific data.  Callers can
442  *       choose not to fill in irrelevant bits; they will be zero.
443  */
444 static void
brw_set_message_descriptor(struct brw_compile * p,struct brw_instruction * inst,enum brw_message_target sfid,unsigned msg_length,unsigned response_length,bool header_present,bool end_of_thread)445 brw_set_message_descriptor(struct brw_compile *p,
446 			   struct brw_instruction *inst,
447 			   enum brw_message_target sfid,
448 			   unsigned msg_length,
449 			   unsigned response_length,
450 			   bool header_present,
451 			   bool end_of_thread)
452 {
453    struct intel_context *intel = &p->brw->intel;
454 
455    brw_set_src1(p, inst, brw_imm_d(0));
456 
457    if (intel->gen >= 5) {
458       inst->bits3.generic_gen5.header_present = header_present;
459       inst->bits3.generic_gen5.response_length = response_length;
460       inst->bits3.generic_gen5.msg_length = msg_length;
461       inst->bits3.generic_gen5.end_of_thread = end_of_thread;
462 
463       if (intel->gen >= 6) {
464 	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
465 	 inst->header.destreg__conditionalmod = sfid;
466       } else {
467 	 /* Set Extended Message Descriptor (ex_desc) */
468 	 inst->bits2.send_gen5.sfid = sfid;
469 	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
470       }
471    } else {
472       inst->bits3.generic.response_length = response_length;
473       inst->bits3.generic.msg_length = msg_length;
474       inst->bits3.generic.msg_target = sfid;
475       inst->bits3.generic.end_of_thread = end_of_thread;
476    }
477 }
478 
brw_set_math_message(struct brw_compile * p,struct brw_instruction * insn,unsigned function,unsigned integer_type,bool low_precision,unsigned dataType)479 static void brw_set_math_message( struct brw_compile *p,
480 				  struct brw_instruction *insn,
481 				  unsigned function,
482 				  unsigned integer_type,
483 				  bool low_precision,
484 				  unsigned dataType )
485 {
486    struct brw_context *brw = p->brw;
487    struct intel_context *intel = &brw->intel;
488    unsigned msg_length;
489    unsigned response_length;
490 
491    /* Infer message length from the function */
492    switch (function) {
493    case BRW_MATH_FUNCTION_POW:
494    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
495    case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
496    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
497       msg_length = 2;
498       break;
499    default:
500       msg_length = 1;
501       break;
502    }
503 
504    /* Infer response length from the function */
505    switch (function) {
506    case BRW_MATH_FUNCTION_SINCOS:
507    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
508       response_length = 2;
509       break;
510    default:
511       response_length = 1;
512       break;
513    }
514 
515 
516    brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
517 			      msg_length, response_length, false, false);
518    if (intel->gen == 5) {
519       insn->bits3.math_gen5.function = function;
520       insn->bits3.math_gen5.int_type = integer_type;
521       insn->bits3.math_gen5.precision = low_precision;
522       insn->bits3.math_gen5.saturate = insn->header.saturate;
523       insn->bits3.math_gen5.data_type = dataType;
524       insn->bits3.math_gen5.snapshot = 0;
525    } else {
526       insn->bits3.math.function = function;
527       insn->bits3.math.int_type = integer_type;
528       insn->bits3.math.precision = low_precision;
529       insn->bits3.math.saturate = insn->header.saturate;
530       insn->bits3.math.data_type = dataType;
531    }
532    insn->header.saturate = 0;
533 }
534 
535 
brw_set_ff_sync_message(struct brw_compile * p,struct brw_instruction * insn,bool allocate,unsigned response_length,bool end_of_thread)536 static void brw_set_ff_sync_message(struct brw_compile *p,
537 				    struct brw_instruction *insn,
538 				    bool allocate,
539 				    unsigned response_length,
540 				    bool end_of_thread)
541 {
542    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
543 			      1, response_length, true, end_of_thread);
544    insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
545    insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
546    insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
547    insn->bits3.urb_gen5.allocate = allocate;
548    insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
549    insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
550 }
551 
brw_set_urb_message(struct brw_compile * p,struct brw_instruction * insn,bool allocate,bool used,unsigned msg_length,unsigned response_length,bool end_of_thread,bool complete,unsigned offset,unsigned swizzle_control)552 static void brw_set_urb_message( struct brw_compile *p,
553 				 struct brw_instruction *insn,
554 				 bool allocate,
555 				 bool used,
556 				 unsigned msg_length,
557 				 unsigned response_length,
558 				 bool end_of_thread,
559 				 bool complete,
560 				 unsigned offset,
561 				 unsigned swizzle_control )
562 {
563    struct brw_context *brw = p->brw;
564    struct intel_context *intel = &brw->intel;
565 
566    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
567 			      msg_length, response_length, true, end_of_thread);
568    if (intel->gen == 7) {
569       insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
570       insn->bits3.urb_gen7.offset = offset;
571       assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
572       insn->bits3.urb_gen7.swizzle_control = swizzle_control;
573       /* per_slot_offset = 0 makes it ignore offsets in message header */
574       insn->bits3.urb_gen7.per_slot_offset = 0;
575       insn->bits3.urb_gen7.complete = complete;
576    } else if (intel->gen >= 5) {
577       insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
578       insn->bits3.urb_gen5.offset = offset;
579       insn->bits3.urb_gen5.swizzle_control = swizzle_control;
580       insn->bits3.urb_gen5.allocate = allocate;
581       insn->bits3.urb_gen5.used = used;	/* ? */
582       insn->bits3.urb_gen5.complete = complete;
583    } else {
584       insn->bits3.urb.opcode = 0;	/* ? */
585       insn->bits3.urb.offset = offset;
586       insn->bits3.urb.swizzle_control = swizzle_control;
587       insn->bits3.urb.allocate = allocate;
588       insn->bits3.urb.used = used;	/* ? */
589       insn->bits3.urb.complete = complete;
590    }
591 }
592 
593 void
brw_set_dp_write_message(struct brw_compile * p,struct brw_instruction * insn,unsigned binding_table_index,unsigned msg_control,unsigned msg_type,unsigned msg_length,bool header_present,unsigned last_render_target,unsigned response_length,unsigned end_of_thread,unsigned send_commit_msg)594 brw_set_dp_write_message(struct brw_compile *p,
595 			 struct brw_instruction *insn,
596 			 unsigned binding_table_index,
597 			 unsigned msg_control,
598 			 unsigned msg_type,
599 			 unsigned msg_length,
600 			 bool header_present,
601 			 unsigned last_render_target,
602 			 unsigned response_length,
603 			 unsigned end_of_thread,
604 			 unsigned send_commit_msg)
605 {
606    struct brw_context *brw = p->brw;
607    struct intel_context *intel = &brw->intel;
608    unsigned sfid;
609 
610    if (intel->gen >= 7) {
611       /* Use the Render Cache for RT writes; otherwise use the Data Cache */
612       if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
613 	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
614       else
615 	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
616    } else if (intel->gen == 6) {
617       /* Use the render cache for all write messages. */
618       sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
619    } else {
620       sfid = BRW_SFID_DATAPORT_WRITE;
621    }
622 
623    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
624 			      header_present, end_of_thread);
625 
626    if (intel->gen >= 7) {
627       insn->bits3.gen7_dp.binding_table_index = binding_table_index;
628       insn->bits3.gen7_dp.msg_control = msg_control |
629                                         last_render_target << 6;
630       insn->bits3.gen7_dp.msg_type = msg_type;
631    } else if (intel->gen == 6) {
632       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
633       insn->bits3.gen6_dp.msg_control = msg_control |
634                                         last_render_target << 5;
635       insn->bits3.gen6_dp.msg_type = msg_type;
636       insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
637    } else if (intel->gen == 5) {
638       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
639       insn->bits3.dp_write_gen5.msg_control = msg_control;
640       insn->bits3.dp_write_gen5.last_render_target = last_render_target;
641       insn->bits3.dp_write_gen5.msg_type = msg_type;
642       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
643    } else {
644       insn->bits3.dp_write.binding_table_index = binding_table_index;
645       insn->bits3.dp_write.msg_control = msg_control;
646       insn->bits3.dp_write.last_render_target = last_render_target;
647       insn->bits3.dp_write.msg_type = msg_type;
648       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
649    }
650 }
651 
652 void
brw_set_dp_read_message(struct brw_compile * p,struct brw_instruction * insn,unsigned binding_table_index,unsigned msg_control,unsigned msg_type,unsigned target_cache,unsigned msg_length,bool header_present,unsigned response_length)653 brw_set_dp_read_message(struct brw_compile *p,
654 			struct brw_instruction *insn,
655 			unsigned binding_table_index,
656 			unsigned msg_control,
657 			unsigned msg_type,
658 			unsigned target_cache,
659 			unsigned msg_length,
660                         bool header_present,
661 			unsigned response_length)
662 {
663    struct brw_context *brw = p->brw;
664    struct intel_context *intel = &brw->intel;
665    unsigned sfid;
666 
667    if (intel->gen >= 7) {
668       sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
669    } else if (intel->gen == 6) {
670       if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
671 	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
672       else
673 	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
674    } else {
675       sfid = BRW_SFID_DATAPORT_READ;
676    }
677 
678    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
679 			      header_present, false);
680 
681    if (intel->gen >= 7) {
682       insn->bits3.gen7_dp.binding_table_index = binding_table_index;
683       insn->bits3.gen7_dp.msg_control = msg_control;
684       insn->bits3.gen7_dp.msg_type = msg_type;
685    } else if (intel->gen == 6) {
686       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
687       insn->bits3.gen6_dp.msg_control = msg_control;
688       insn->bits3.gen6_dp.msg_type = msg_type;
689       insn->bits3.gen6_dp.send_commit_msg = 0;
690    } else if (intel->gen == 5) {
691       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
692       insn->bits3.dp_read_gen5.msg_control = msg_control;
693       insn->bits3.dp_read_gen5.msg_type = msg_type;
694       insn->bits3.dp_read_gen5.target_cache = target_cache;
695    } else if (intel->is_g4x) {
696       insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
697       insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
698       insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
699       insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
700    } else {
701       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
702       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
703       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
704       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
705    }
706 }
707 
708 void
brw_set_sampler_message(struct brw_compile * p,struct brw_instruction * insn,unsigned binding_table_index,unsigned sampler,unsigned msg_type,unsigned response_length,unsigned msg_length,unsigned header_present,unsigned simd_mode,unsigned return_format)709 brw_set_sampler_message(struct brw_compile *p,
710                         struct brw_instruction *insn,
711                         unsigned binding_table_index,
712                         unsigned sampler,
713                         unsigned msg_type,
714                         unsigned response_length,
715                         unsigned msg_length,
716                         unsigned header_present,
717                         unsigned simd_mode,
718                         unsigned return_format)
719 {
720    struct brw_context *brw = p->brw;
721    struct intel_context *intel = &brw->intel;
722 
723    brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
724 			      response_length, header_present, false);
725 
726    if (intel->gen >= 7) {
727       insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
728       insn->bits3.sampler_gen7.sampler = sampler;
729       insn->bits3.sampler_gen7.msg_type = msg_type;
730       insn->bits3.sampler_gen7.simd_mode = simd_mode;
731    } else if (intel->gen >= 5) {
732       insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
733       insn->bits3.sampler_gen5.sampler = sampler;
734       insn->bits3.sampler_gen5.msg_type = msg_type;
735       insn->bits3.sampler_gen5.simd_mode = simd_mode;
736    } else if (intel->is_g4x) {
737       insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
738       insn->bits3.sampler_g4x.sampler = sampler;
739       insn->bits3.sampler_g4x.msg_type = msg_type;
740    } else {
741       insn->bits3.sampler.binding_table_index = binding_table_index;
742       insn->bits3.sampler.sampler = sampler;
743       insn->bits3.sampler.msg_type = msg_type;
744       insn->bits3.sampler.return_format = return_format;
745    }
746 }
747 
748 
749 #define next_insn brw_next_insn
750 struct brw_instruction *
brw_next_insn(struct brw_compile * p,unsigned opcode)751 brw_next_insn(struct brw_compile *p, unsigned opcode)
752 {
753    struct brw_instruction *insn;
754 
755    if (p->nr_insn + 1 > p->store_size) {
756       if (0)
757          printf("incresing the store size to %d\n", p->store_size << 1);
758       p->store_size <<= 1;
759       p->store = reralloc(p->mem_ctx, p->store,
760                           struct brw_instruction, p->store_size);
761       if (!p->store)
762          assert(!"realloc eu store memeory failed");
763    }
764 
765    p->next_insn_offset += 16;
766    insn = &p->store[p->nr_insn++];
767    memcpy(insn, p->current, sizeof(*insn));
768 
769    /* Reset this one-shot flag:
770     */
771 
772    if (p->current->header.destreg__conditionalmod) {
773       p->current->header.destreg__conditionalmod = 0;
774       p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
775    }
776 
777    insn->header.opcode = opcode;
778    return insn;
779 }
780 
/**
 * Emit a one-source instruction: allocate the next instruction with
 * \p opcode, then encode \p dest and \p src (as source 0).
 *
 * \return the newly emitted instruction.
 */
static struct brw_instruction *
brw_alu1(struct brw_compile *p,
         unsigned opcode,
         struct brw_reg dest,
         struct brw_reg src)
{
   struct brw_instruction *inst = brw_next_insn(p, opcode);

   brw_set_dest(p, inst, dest);
   brw_set_src0(p, inst, src);

   return inst;
}
791 
/**
 * Emit a two-source instruction: allocate the next instruction with
 * \p opcode, then encode \p dest, \p src0 and \p src1 in that order.
 *
 * \return the newly emitted instruction.
 */
static struct brw_instruction *
brw_alu2(struct brw_compile *p,
         unsigned opcode,
         struct brw_reg dest,
         struct brw_reg src0,
         struct brw_reg src1)
{
   struct brw_instruction *inst = brw_next_insn(p, opcode);

   brw_set_dest(p, inst, dest);
   brw_set_src0(p, inst, src0);
   brw_set_src1(p, inst, src1);

   return inst;
}
804 
805 static int
get_3src_subreg_nr(struct brw_reg reg)806 get_3src_subreg_nr(struct brw_reg reg)
807 {
808    if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
809       assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
810       return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
811    } else {
812       return reg.subnr / 4;
813    }
814 }
815 
get_3src_type(int type)816 static int get_3src_type(int type)
817 {
818    assert(type == BRW_REGISTER_TYPE_F ||
819 	  type == BRW_REGISTER_TYPE_D ||
820 	  type == BRW_REGISTER_TYPE_UD);
821 
822    switch(type) {
823    case BRW_REGISTER_TYPE_F: return BRW_REGISTER_3SRC_TYPE_F;
824    case BRW_REGISTER_TYPE_D: return BRW_REGISTER_3SRC_TYPE_D;
825    case BRW_REGISTER_TYPE_UD: return BRW_REGISTER_3SRC_TYPE_UD;
826    }
827 
828    return BRW_REGISTER_3SRC_TYPE_F;
829 }
830 
831 void
brw_set_3src_dest(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg dest)832 brw_set_3src_dest(struct brw_compile *p,
833 		  struct brw_instruction *insn,
834 		  struct brw_reg dest)
835 {
836    gen7_convert_mrf_to_grf(p, &dest);
837 
838    assert(insn->header.access_mode == BRW_ALIGN_16);
839 
840    assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
841 	  dest.file == BRW_MESSAGE_REGISTER_FILE);
842    assert(dest.nr < 128);
843    assert(dest.address_mode == BRW_ADDRESS_DIRECT);
844    insn->bits1.da3src.dest_reg_type = get_3src_type(dest.type);
845    insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
846    insn->bits1.da3src.dest_reg_nr = dest.nr;
847    insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
848    insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
849    guess_execution_size(p, insn, dest);
850 }
851 
852 void
brw_set_3src_src0(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg src0)853 brw_set_3src_src0(struct brw_compile *p,
854 		  struct brw_instruction *insn,
855 		  struct brw_reg src0)
856 {
857    assert(src0.file == BRW_GENERAL_REGISTER_FILE);
858    assert(src0.address_mode == BRW_ADDRESS_DIRECT);
859    assert(src0.nr < 128);
860    insn->bits1.da3src.src_reg_type = get_3src_type(src0.type);
861    insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
862    insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
863    insn->bits2.da3src.src0_reg_nr = src0.nr;
864    insn->bits1.da3src.src0_abs = src0.abs;
865    insn->bits1.da3src.src0_negate = src0.negate;
866    insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;
867 }
868 
869 void
brw_set_3src_src1(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg src1)870 brw_set_3src_src1(struct brw_compile *p,
871 		  struct brw_instruction *insn,
872 		  struct brw_reg src1)
873 {
874    assert(src1.file == BRW_GENERAL_REGISTER_FILE);
875    assert(src1.address_mode == BRW_ADDRESS_DIRECT);
876    assert(src1.nr < 128);
877    assert(get_3src_type(src1.type) == insn->bits1.da3src.src_reg_type);
878    insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
879    insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
880    insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
881    insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
882    insn->bits3.da3src.src1_reg_nr = src1.nr;
883    insn->bits1.da3src.src1_abs = src1.abs;
884    insn->bits1.da3src.src1_negate = src1.negate;
885 }
886 
887 void
brw_set_3src_src2(struct brw_compile * p,struct brw_instruction * insn,struct brw_reg src2)888 brw_set_3src_src2(struct brw_compile *p,
889 		  struct brw_instruction *insn,
890 		  struct brw_reg src2)
891 {
892    assert(src2.file == BRW_GENERAL_REGISTER_FILE);
893    assert(src2.address_mode == BRW_ADDRESS_DIRECT);
894    assert(src2.nr < 128);
895    assert(get_3src_type(src2.type) == insn->bits1.da3src.src_reg_type);
896    insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
897    insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
898    insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
899    insn->bits3.da3src.src2_reg_nr = src2.nr;
900    insn->bits1.da3src.src2_abs = src2.abs;
901    insn->bits1.da3src.src2_negate = src2.negate;
902 }
903 
/**
 * Emit a three-source instruction: allocate it with the given opcode, then
 * fill in the destination and all three sources using the three-source
 * setters.  src0 is set before src1 and src2 because those setters check
 * their type against the one recorded for src0.  Returns the new
 * instruction.
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
                                        unsigned opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1,
                                        struct brw_reg src2)
{
   struct brw_instruction *inst = next_insn(p, opcode);

   brw_set_3src_dest(p, inst, dest);
   brw_set_3src_src0(p, inst, src0);
   brw_set_3src_src1(p, inst, src1);
   brw_set_3src_src2(p, inst, src2);

   return inst;
}
918 
919 
920 /***********************************************************************
921  * Convenience routines.
922  */
/* ALU1(OP) defines the public emitter brw_<OP>(p, dest, src0), which
 * forwards to brw_alu1() with opcode BRW_OPCODE_<OP> and returns the
 * emitted instruction.
 */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}
930 
/* ALU2(OP) defines the public emitter brw_<OP>(p, dest, src0, src1), which
 * forwards to brw_alu2() with opcode BRW_OPCODE_<OP> and returns the
 * emitted instruction.
 */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}
939 
/* ALU3(OP) defines the public emitter brw_<OP>(p, dest, src0, src1, src2),
 * which forwards to brw_alu3() with opcode BRW_OPCODE_<OP> and returns the
 * emitted instruction.
 */
#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
949 
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 *
 * ROUND(OP) defines void brw_<OP>(p, dest, src).  It always emits the
 * BRW_OPCODE_<OP> instruction.  Only when gen < 6 does it also set the
 * BRW_CONDITIONAL_R conditional modifier on that instruction and emit the
 * follow-up ADD dest, dest, 1.0f with BRW_PREDICATE_NORMAL.  The rounding
 * instruction is modified before brw_ADD() is called; rnd is not touched
 * after the ADD has been emitted.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->intel.gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
974 
975 
/* Instantiate the public emitters.  Each ALU1/ALU2/ALU3 line defines a
 * brw_<OP>() function that forwards to brw_alu1/brw_alu2/brw_alu3 with the
 * matching BRW_OPCODE_<OP>.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)

/* Rounding emitters that may need a follow-up predicated ADD (see ROUND). */
ROUND(RNDZ)
ROUND(RNDE)
1002 
1003 
1004 struct brw_instruction *brw_ADD(struct brw_compile *p,
1005 				struct brw_reg dest,
1006 				struct brw_reg src0,
1007 				struct brw_reg src1)
1008 {
1009    /* 6.2.2: add */
1010    if (src0.type == BRW_REGISTER_TYPE_F ||
1011        (src0.file == BRW_IMMEDIATE_VALUE &&
1012 	src0.type == BRW_REGISTER_TYPE_VF)) {
1013       assert(src1.type != BRW_REGISTER_TYPE_UD);
1014       assert(src1.type != BRW_REGISTER_TYPE_D);
1015    }
1016 
1017    if (src1.type == BRW_REGISTER_TYPE_F ||
1018        (src1.file == BRW_IMMEDIATE_VALUE &&
1019 	src1.type == BRW_REGISTER_TYPE_VF)) {
1020       assert(src0.type != BRW_REGISTER_TYPE_UD);
1021       assert(src0.type != BRW_REGISTER_TYPE_D);
1022    }
1023 
1024    return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1025 }
1026 
brw_AVG(struct brw_compile * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1027 struct brw_instruction *brw_AVG(struct brw_compile *p,
1028                                 struct brw_reg dest,
1029                                 struct brw_reg src0,
1030                                 struct brw_reg src1)
1031 {
1032    assert(dest.type == src0.type);
1033    assert(src0.type == src1.type);
1034    switch (src0.type) {
1035    case BRW_REGISTER_TYPE_B:
1036    case BRW_REGISTER_TYPE_UB:
1037    case BRW_REGISTER_TYPE_W:
1038    case BRW_REGISTER_TYPE_UW:
1039    case BRW_REGISTER_TYPE_D:
1040    case BRW_REGISTER_TYPE_UD:
1041       break;
1042    default:
1043       assert(!"Bad type for brw_AVG");
1044    }
1045 
1046    return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1047 }
1048 
brw_MUL(struct brw_compile * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1049 struct brw_instruction *brw_MUL(struct brw_compile *p,
1050 				struct brw_reg dest,
1051 				struct brw_reg src0,
1052 				struct brw_reg src1)
1053 {
1054    /* 6.32.38: mul */
1055    if (src0.type == BRW_REGISTER_TYPE_D ||
1056        src0.type == BRW_REGISTER_TYPE_UD ||
1057        src1.type == BRW_REGISTER_TYPE_D ||
1058        src1.type == BRW_REGISTER_TYPE_UD) {
1059       assert(dest.type != BRW_REGISTER_TYPE_F);
1060    }
1061 
1062    if (src0.type == BRW_REGISTER_TYPE_F ||
1063        (src0.file == BRW_IMMEDIATE_VALUE &&
1064 	src0.type == BRW_REGISTER_TYPE_VF)) {
1065       assert(src1.type != BRW_REGISTER_TYPE_UD);
1066       assert(src1.type != BRW_REGISTER_TYPE_D);
1067    }
1068 
1069    if (src1.type == BRW_REGISTER_TYPE_F ||
1070        (src1.file == BRW_IMMEDIATE_VALUE &&
1071 	src1.type == BRW_REGISTER_TYPE_VF)) {
1072       assert(src0.type != BRW_REGISTER_TYPE_UD);
1073       assert(src0.type != BRW_REGISTER_TYPE_D);
1074    }
1075 
1076    assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1077 	  src0.nr != BRW_ARF_ACCUMULATOR);
1078    assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1079 	  src1.nr != BRW_ARF_ACCUMULATOR);
1080 
1081    return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1082 }
1083 
1084 
brw_NOP(struct brw_compile * p)1085 void brw_NOP(struct brw_compile *p)
1086 {
1087    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1088    brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1089    brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1090    brw_set_src1(p, insn, brw_imm_ud(0x0));
1091 }
1092 
1093 
1094 
1095 
1096 
1097 /***********************************************************************
1098  * Comparisons, if/else/endif
1099  */
1100 
brw_JMPI(struct brw_compile * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1101 struct brw_instruction *brw_JMPI(struct brw_compile *p,
1102                                  struct brw_reg dest,
1103                                  struct brw_reg src0,
1104                                  struct brw_reg src1)
1105 {
1106    struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
1107 
1108    insn->header.execution_size = 1;
1109    insn->header.compression_control = BRW_COMPRESSION_NONE;
1110    insn->header.mask_control = BRW_MASK_DISABLE;
1111 
1112    p->current->header.predicate_control = BRW_PREDICATE_NONE;
1113 
1114    return insn;
1115 }
1116 
1117 static void
push_if_stack(struct brw_compile * p,struct brw_instruction * inst)1118 push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1119 {
1120    p->if_stack[p->if_stack_depth] = inst - p->store;
1121 
1122    p->if_stack_depth++;
1123    if (p->if_stack_array_size <= p->if_stack_depth) {
1124       p->if_stack_array_size *= 2;
1125       p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1126 			     p->if_stack_array_size);
1127    }
1128 }
1129 
1130 static struct brw_instruction *
pop_if_stack(struct brw_compile * p)1131 pop_if_stack(struct brw_compile *p)
1132 {
1133    p->if_stack_depth--;
1134    return &p->store[p->if_stack[p->if_stack_depth]];
1135 }
1136 
1137 static void
push_loop_stack(struct brw_compile * p,struct brw_instruction * inst)1138 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1139 {
1140    if (p->loop_stack_array_size < p->loop_stack_depth) {
1141       p->loop_stack_array_size *= 2;
1142       p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1143 			       p->loop_stack_array_size);
1144       p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1145 				     p->loop_stack_array_size);
1146    }
1147 
1148    p->loop_stack[p->loop_stack_depth] = inst - p->store;
1149    p->loop_stack_depth++;
1150    p->if_depth_in_loop[p->loop_stack_depth] = 0;
1151 }
1152 
1153 static struct brw_instruction *
get_inner_do_insn(struct brw_compile * p)1154 get_inner_do_insn(struct brw_compile *p)
1155 {
1156    return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1157 }
1158 
1159 /* EU takes the value from the flag register and pushes it onto some
1160  * sort of a stack (presumably merging with any flag value already on
1161  * the stack).  Within an if block, the flags at the top of the stack
1162  * control execution on each channel of the unit, eg. on each of the
1163  * 16 pixel values in our wm programs.
1164  *
1165  * When the matching 'else' instruction is reached (presumably by
1166  * countdown of the instruction count patched in by our ELSE/ENDIF
1167  * functions), the relevent flags are inverted.
1168  *
1169  * When the matching 'endif' instruction is reached, the flags are
1170  * popped off.  If the stack is now empty, normal execution resumes.
1171  */
1172 struct brw_instruction *
brw_IF(struct brw_compile * p,unsigned execute_size)1173 brw_IF(struct brw_compile *p, unsigned execute_size)
1174 {
1175    struct intel_context *intel = &p->brw->intel;
1176    struct brw_instruction *insn;
1177 
1178    insn = next_insn(p, BRW_OPCODE_IF);
1179 
1180    /* Override the defaults for this instruction:
1181     */
1182    if (intel->gen < 6) {
1183       brw_set_dest(p, insn, brw_ip_reg());
1184       brw_set_src0(p, insn, brw_ip_reg());
1185       brw_set_src1(p, insn, brw_imm_d(0x0));
1186    } else if (intel->gen == 6) {
1187       brw_set_dest(p, insn, brw_imm_w(0));
1188       insn->bits1.branch_gen6.jump_count = 0;
1189       brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1190       brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1191    } else {
1192       brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1193       brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1194       brw_set_src1(p, insn, brw_imm_ud(0));
1195       insn->bits3.break_cont.jip = 0;
1196       insn->bits3.break_cont.uip = 0;
1197    }
1198 
1199    insn->header.execution_size = execute_size;
1200    insn->header.compression_control = BRW_COMPRESSION_NONE;
1201    insn->header.predicate_control = BRW_PREDICATE_NORMAL;
1202    insn->header.mask_control = BRW_MASK_ENABLE;
1203    if (!p->single_program_flow)
1204       insn->header.thread_control = BRW_THREAD_SWITCH;
1205 
1206    p->current->header.predicate_control = BRW_PREDICATE_NONE;
1207 
1208    push_if_stack(p, insn);
1209    p->if_depth_in_loop[p->loop_stack_depth]++;
1210    return insn;
1211 }
1212 
1213 /* This function is only used for gen6-style IF instructions with an
1214  * embedded comparison (conditional modifier).  It is not used on gen7.
1215  */
1216 struct brw_instruction *
gen6_IF(struct brw_compile * p,uint32_t conditional,struct brw_reg src0,struct brw_reg src1)1217 gen6_IF(struct brw_compile *p, uint32_t conditional,
1218 	struct brw_reg src0, struct brw_reg src1)
1219 {
1220    struct brw_instruction *insn;
1221 
1222    insn = next_insn(p, BRW_OPCODE_IF);
1223 
1224    brw_set_dest(p, insn, brw_imm_w(0));
1225    if (p->compressed) {
1226       insn->header.execution_size = BRW_EXECUTE_16;
1227    } else {
1228       insn->header.execution_size = BRW_EXECUTE_8;
1229    }
1230    insn->bits1.branch_gen6.jump_count = 0;
1231    brw_set_src0(p, insn, src0);
1232    brw_set_src1(p, insn, src1);
1233 
1234    assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1235    assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1236    insn->header.destreg__conditionalmod = conditional;
1237 
1238    if (!p->single_program_flow)
1239       insn->header.thread_control = BRW_THREAD_SWITCH;
1240 
1241    push_if_stack(p, insn);
1242    return insn;
1243 }
1244 
1245 /**
1246  * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1247  */
1248 static void
convert_IF_ELSE_to_ADD(struct brw_compile * p,struct brw_instruction * if_inst,struct brw_instruction * else_inst)1249 convert_IF_ELSE_to_ADD(struct brw_compile *p,
1250 		       struct brw_instruction *if_inst,
1251 		       struct brw_instruction *else_inst)
1252 {
1253    /* The next instruction (where the ENDIF would be, if it existed) */
1254    struct brw_instruction *next_inst = &p->store[p->nr_insn];
1255 
1256    assert(p->single_program_flow);
1257    assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1258    assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1259    assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1260 
1261    /* Convert IF to an ADD instruction that moves the instruction pointer
1262     * to the first instruction of the ELSE block.  If there is no ELSE
1263     * block, point to where ENDIF would be.  Reverse the predicate.
1264     *
1265     * There's no need to execute an ENDIF since we don't need to do any
1266     * stack operations, and if we're currently executing, we just want to
1267     * continue normally.
1268     */
1269    if_inst->header.opcode = BRW_OPCODE_ADD;
1270    if_inst->header.predicate_inverse = 1;
1271 
1272    if (else_inst != NULL) {
1273       /* Convert ELSE to an ADD instruction that points where the ENDIF
1274        * would be.
1275        */
1276       else_inst->header.opcode = BRW_OPCODE_ADD;
1277 
1278       if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1279       else_inst->bits3.ud = (next_inst - else_inst) * 16;
1280    } else {
1281       if_inst->bits3.ud = (next_inst - if_inst) * 16;
1282    }
1283 }
1284 
1285 /**
1286  * Patch IF and ELSE instructions with appropriate jump targets.
1287  */
1288 static void
patch_IF_ELSE(struct brw_compile * p,struct brw_instruction * if_inst,struct brw_instruction * else_inst,struct brw_instruction * endif_inst)1289 patch_IF_ELSE(struct brw_compile *p,
1290 	      struct brw_instruction *if_inst,
1291 	      struct brw_instruction *else_inst,
1292 	      struct brw_instruction *endif_inst)
1293 {
1294    struct intel_context *intel = &p->brw->intel;
1295 
1296    /* We shouldn't be patching IF and ELSE instructions in single program flow
1297     * mode when gen < 6, because in single program flow mode on those
1298     * platforms, we convert flow control instructions to conditional ADDs that
1299     * operate on IP (see brw_ENDIF).
1300     *
1301     * However, on Gen6, writing to IP doesn't work in single program flow mode
1302     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1303     * not be updated by non-flow control instructions.").  And on later
1304     * platforms, there is no significant benefit to converting control flow
1305     * instructions to conditional ADDs.  So we do patch IF and ELSE
1306     * instructions in single program flow mode on those platforms.
1307     */
1308    if (intel->gen < 6)
1309       assert(!p->single_program_flow);
1310 
1311    assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1312    assert(endif_inst != NULL);
1313    assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1314 
1315    unsigned br = 1;
1316    /* Jump count is for 64bit data chunk each, so one 128bit instruction
1317     * requires 2 chunks.
1318     */
1319    if (intel->gen >= 5)
1320       br = 2;
1321 
1322    assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1323    endif_inst->header.execution_size = if_inst->header.execution_size;
1324 
1325    if (else_inst == NULL) {
1326       /* Patch IF -> ENDIF */
1327       if (intel->gen < 6) {
1328 	 /* Turn it into an IFF, which means no mask stack operations for
1329 	  * all-false and jumping past the ENDIF.
1330 	  */
1331 	 if_inst->header.opcode = BRW_OPCODE_IFF;
1332 	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1333 	 if_inst->bits3.if_else.pop_count = 0;
1334 	 if_inst->bits3.if_else.pad0 = 0;
1335       } else if (intel->gen == 6) {
1336 	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1337 	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1338       } else {
1339 	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1340 	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1341       }
1342    } else {
1343       else_inst->header.execution_size = if_inst->header.execution_size;
1344 
1345       /* Patch IF -> ELSE */
1346       if (intel->gen < 6) {
1347 	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1348 	 if_inst->bits3.if_else.pop_count = 0;
1349 	 if_inst->bits3.if_else.pad0 = 0;
1350       } else if (intel->gen == 6) {
1351 	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1352       }
1353 
1354       /* Patch ELSE -> ENDIF */
1355       if (intel->gen < 6) {
1356 	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1357 	  * matching ENDIF.
1358 	  */
1359 	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1360 	 else_inst->bits3.if_else.pop_count = 1;
1361 	 else_inst->bits3.if_else.pad0 = 0;
1362       } else if (intel->gen == 6) {
1363 	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1364 	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1365       } else {
1366 	 /* The IF instruction's JIP should point just past the ELSE */
1367 	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1368 	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1369 	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1370 	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1371       }
1372    }
1373 }
1374 
1375 void
brw_ELSE(struct brw_compile * p)1376 brw_ELSE(struct brw_compile *p)
1377 {
1378    struct intel_context *intel = &p->brw->intel;
1379    struct brw_instruction *insn;
1380 
1381    insn = next_insn(p, BRW_OPCODE_ELSE);
1382 
1383    if (intel->gen < 6) {
1384       brw_set_dest(p, insn, brw_ip_reg());
1385       brw_set_src0(p, insn, brw_ip_reg());
1386       brw_set_src1(p, insn, brw_imm_d(0x0));
1387    } else if (intel->gen == 6) {
1388       brw_set_dest(p, insn, brw_imm_w(0));
1389       insn->bits1.branch_gen6.jump_count = 0;
1390       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1391       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1392    } else {
1393       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1394       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1395       brw_set_src1(p, insn, brw_imm_ud(0));
1396       insn->bits3.break_cont.jip = 0;
1397       insn->bits3.break_cont.uip = 0;
1398    }
1399 
1400    insn->header.compression_control = BRW_COMPRESSION_NONE;
1401    insn->header.mask_control = BRW_MASK_ENABLE;
1402    if (!p->single_program_flow)
1403       insn->header.thread_control = BRW_THREAD_SWITCH;
1404 
1405    push_if_stack(p, insn);
1406 }
1407 
1408 void
brw_ENDIF(struct brw_compile * p)1409 brw_ENDIF(struct brw_compile *p)
1410 {
1411    struct intel_context *intel = &p->brw->intel;
1412    struct brw_instruction *insn = NULL;
1413    struct brw_instruction *else_inst = NULL;
1414    struct brw_instruction *if_inst = NULL;
1415    struct brw_instruction *tmp;
1416    bool emit_endif = true;
1417 
1418    /* In single program flow mode, we can express IF and ELSE instructions
1419     * equivalently as ADD instructions that operate on IP.  On platforms prior
1420     * to Gen6, flow control instructions cause an implied thread switch, so
1421     * this is a significant savings.
1422     *
1423     * However, on Gen6, writing to IP doesn't work in single program flow mode
1424     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1425     * not be updated by non-flow control instructions.").  And on later
1426     * platforms, there is no significant benefit to converting control flow
1427     * instructions to conditional ADDs.  So we only do this trick on Gen4 and
1428     * Gen5.
1429     */
1430    if (intel->gen < 6 && p->single_program_flow)
1431       emit_endif = false;
1432 
1433    /*
1434     * A single next_insn() may change the base adress of instruction store
1435     * memory(p->store), so call it first before referencing the instruction
1436     * store pointer from an index
1437     */
1438    if (emit_endif)
1439       insn = next_insn(p, BRW_OPCODE_ENDIF);
1440 
1441    /* Pop the IF and (optional) ELSE instructions from the stack */
1442    p->if_depth_in_loop[p->loop_stack_depth]--;
1443    tmp = pop_if_stack(p);
1444    if (tmp->header.opcode == BRW_OPCODE_ELSE) {
1445       else_inst = tmp;
1446       tmp = pop_if_stack(p);
1447    }
1448    if_inst = tmp;
1449 
1450    if (!emit_endif) {
1451       /* ENDIF is useless; don't bother emitting it. */
1452       convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1453       return;
1454    }
1455 
1456    if (intel->gen < 6) {
1457       brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1458       brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1459       brw_set_src1(p, insn, brw_imm_d(0x0));
1460    } else if (intel->gen == 6) {
1461       brw_set_dest(p, insn, brw_imm_w(0));
1462       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1463       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1464    } else {
1465       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1466       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1467       brw_set_src1(p, insn, brw_imm_ud(0));
1468    }
1469 
1470    insn->header.compression_control = BRW_COMPRESSION_NONE;
1471    insn->header.mask_control = BRW_MASK_ENABLE;
1472    insn->header.thread_control = BRW_THREAD_SWITCH;
1473 
1474    /* Also pop item off the stack in the endif instruction: */
1475    if (intel->gen < 6) {
1476       insn->bits3.if_else.jump_count = 0;
1477       insn->bits3.if_else.pop_count = 1;
1478       insn->bits3.if_else.pad0 = 0;
1479    } else if (intel->gen == 6) {
1480       insn->bits1.branch_gen6.jump_count = 2;
1481    } else {
1482       insn->bits3.break_cont.jip = 2;
1483    }
1484    patch_IF_ELSE(p, if_inst, else_inst, insn);
1485 }
1486 
brw_BREAK(struct brw_compile * p)1487 struct brw_instruction *brw_BREAK(struct brw_compile *p)
1488 {
1489    struct intel_context *intel = &p->brw->intel;
1490    struct brw_instruction *insn;
1491 
1492    insn = next_insn(p, BRW_OPCODE_BREAK);
1493    if (intel->gen >= 6) {
1494       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1495       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1496       brw_set_src1(p, insn, brw_imm_d(0x0));
1497    } else {
1498       brw_set_dest(p, insn, brw_ip_reg());
1499       brw_set_src0(p, insn, brw_ip_reg());
1500       brw_set_src1(p, insn, brw_imm_d(0x0));
1501       insn->bits3.if_else.pad0 = 0;
1502       insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1503    }
1504    insn->header.compression_control = BRW_COMPRESSION_NONE;
1505    insn->header.execution_size = BRW_EXECUTE_8;
1506 
1507    return insn;
1508 }
1509 
gen6_CONT(struct brw_compile * p)1510 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1511 {
1512    struct brw_instruction *insn;
1513 
1514    insn = next_insn(p, BRW_OPCODE_CONTINUE);
1515    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1516    brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1517    brw_set_dest(p, insn, brw_ip_reg());
1518    brw_set_src0(p, insn, brw_ip_reg());
1519    brw_set_src1(p, insn, brw_imm_d(0x0));
1520 
1521    insn->header.compression_control = BRW_COMPRESSION_NONE;
1522    insn->header.execution_size = BRW_EXECUTE_8;
1523    return insn;
1524 }
1525 
brw_CONT(struct brw_compile * p)1526 struct brw_instruction *brw_CONT(struct brw_compile *p)
1527 {
1528    struct brw_instruction *insn;
1529    insn = next_insn(p, BRW_OPCODE_CONTINUE);
1530    brw_set_dest(p, insn, brw_ip_reg());
1531    brw_set_src0(p, insn, brw_ip_reg());
1532    brw_set_src1(p, insn, brw_imm_d(0x0));
1533    insn->header.compression_control = BRW_COMPRESSION_NONE;
1534    insn->header.execution_size = BRW_EXECUTE_8;
1535    /* insn->header.mask_control = BRW_MASK_DISABLE; */
1536    insn->bits3.if_else.pad0 = 0;
1537    insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1538    return insn;
1539 }
1540 
gen6_HALT(struct brw_compile * p)1541 struct brw_instruction *gen6_HALT(struct brw_compile *p)
1542 {
1543    struct brw_instruction *insn;
1544 
1545    insn = next_insn(p, BRW_OPCODE_HALT);
1546    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1547    brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1548    brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1549 
1550    if (p->compressed) {
1551       insn->header.execution_size = BRW_EXECUTE_16;
1552    } else {
1553       insn->header.compression_control = BRW_COMPRESSION_NONE;
1554       insn->header.execution_size = BRW_EXECUTE_8;
1555    }
1556    return insn;
1557 }
1558 
1559 /* DO/WHILE loop:
1560  *
1561  * The DO/WHILE is just an unterminated loop -- break or continue are
1562  * used for control within the loop.  We have a few ways they can be
1563  * done.
1564  *
1565  * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1566  * jip and no DO instruction.
1567  *
1568  * For non-uniform control flow pre-gen6, there's a DO instruction to
1569  * push the mask, and a WHILE to jump back, and BREAK to get out and
1570  * pop the mask.
1571  *
1572  * For gen6, there's no more mask stack, so no need for DO.  WHILE
1573  * just points back to the first instruction of the loop.
1574  */
brw_DO(struct brw_compile * p,unsigned execute_size)1575 struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
1576 {
1577    struct intel_context *intel = &p->brw->intel;
1578 
1579    if (intel->gen >= 6 || p->single_program_flow) {
1580       push_loop_stack(p, &p->store[p->nr_insn]);
1581       return &p->store[p->nr_insn];
1582    } else {
1583       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1584 
1585       push_loop_stack(p, insn);
1586 
1587       /* Override the defaults for this instruction:
1588        */
1589       brw_set_dest(p, insn, brw_null_reg());
1590       brw_set_src0(p, insn, brw_null_reg());
1591       brw_set_src1(p, insn, brw_null_reg());
1592 
1593       insn->header.compression_control = BRW_COMPRESSION_NONE;
1594       insn->header.execution_size = execute_size;
1595       insn->header.predicate_control = BRW_PREDICATE_NONE;
1596       /* insn->header.mask_control = BRW_MASK_ENABLE; */
1597       /* insn->header.mask_control = BRW_MASK_DISABLE; */
1598 
1599       return insn;
1600    }
1601 }
1602 
1603 /**
1604  * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1605  * instruction here.
1606  *
1607  * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1608  * nesting, since it can always just point to the end of the block/current loop.
1609  */
1610 static void
brw_patch_break_cont(struct brw_compile * p,struct brw_instruction * while_inst)1611 brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1612 {
1613    struct intel_context *intel = &p->brw->intel;
1614    struct brw_instruction *do_inst = get_inner_do_insn(p);
1615    struct brw_instruction *inst;
1616    int br = (intel->gen == 5) ? 2 : 1;
1617 
1618    for (inst = while_inst - 1; inst != do_inst; inst--) {
1619       /* If the jump count is != 0, that means that this instruction has already
1620        * been patched because it's part of a loop inside of the one we're
1621        * patching.
1622        */
1623       if (inst->header.opcode == BRW_OPCODE_BREAK &&
1624 	  inst->bits3.if_else.jump_count == 0) {
1625 	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1626       } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1627 		 inst->bits3.if_else.jump_count == 0) {
1628 	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
1629       }
1630    }
1631 }
1632 
brw_WHILE(struct brw_compile * p)1633 struct brw_instruction *brw_WHILE(struct brw_compile *p)
1634 {
1635    struct intel_context *intel = &p->brw->intel;
1636    struct brw_instruction *insn, *do_insn;
1637    unsigned br = 1;
1638 
1639    if (intel->gen >= 5)
1640       br = 2;
1641 
1642    if (intel->gen >= 7) {
1643       insn = next_insn(p, BRW_OPCODE_WHILE);
1644       do_insn = get_inner_do_insn(p);
1645 
1646       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1647       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1648       brw_set_src1(p, insn, brw_imm_ud(0));
1649       insn->bits3.break_cont.jip = br * (do_insn - insn);
1650 
1651       insn->header.execution_size = BRW_EXECUTE_8;
1652    } else if (intel->gen == 6) {
1653       insn = next_insn(p, BRW_OPCODE_WHILE);
1654       do_insn = get_inner_do_insn(p);
1655 
1656       brw_set_dest(p, insn, brw_imm_w(0));
1657       insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1658       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1659       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1660 
1661       insn->header.execution_size = BRW_EXECUTE_8;
1662    } else {
1663       if (p->single_program_flow) {
1664 	 insn = next_insn(p, BRW_OPCODE_ADD);
1665          do_insn = get_inner_do_insn(p);
1666 
1667 	 brw_set_dest(p, insn, brw_ip_reg());
1668 	 brw_set_src0(p, insn, brw_ip_reg());
1669 	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1670 	 insn->header.execution_size = BRW_EXECUTE_1;
1671       } else {
1672 	 insn = next_insn(p, BRW_OPCODE_WHILE);
1673          do_insn = get_inner_do_insn(p);
1674 
1675 	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1676 
1677 	 brw_set_dest(p, insn, brw_ip_reg());
1678 	 brw_set_src0(p, insn, brw_ip_reg());
1679 	 brw_set_src1(p, insn, brw_imm_d(0));
1680 
1681 	 insn->header.execution_size = do_insn->header.execution_size;
1682 	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1683 	 insn->bits3.if_else.pop_count = 0;
1684 	 insn->bits3.if_else.pad0 = 0;
1685 
1686 	 brw_patch_break_cont(p, insn);
1687       }
1688    }
1689    insn->header.compression_control = BRW_COMPRESSION_NONE;
1690    p->current->header.predicate_control = BRW_PREDICATE_NONE;
1691 
1692    p->loop_stack_depth--;
1693 
1694    return insn;
1695 }
1696 
1697 
1698 /* FORWARD JUMPS:
1699  */
brw_land_fwd_jump(struct brw_compile * p,int jmp_insn_idx)1700 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1701 {
1702    struct intel_context *intel = &p->brw->intel;
1703    struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1704    unsigned jmpi = 1;
1705 
1706    if (intel->gen >= 5)
1707       jmpi = 2;
1708 
1709    assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1710    assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1711 
1712    jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1713 }
1714 
1715 
1716 
1717 /* To integrate with the above, it makes sense that the comparison
1718  * instruction should populate the flag register.  It might be simpler
1719  * just to use the flag reg for most WM tasks?
1720  */
brw_CMP(struct brw_compile * p,struct brw_reg dest,unsigned conditional,struct brw_reg src0,struct brw_reg src1)1721 void brw_CMP(struct brw_compile *p,
1722 	     struct brw_reg dest,
1723 	     unsigned conditional,
1724 	     struct brw_reg src0,
1725 	     struct brw_reg src1)
1726 {
1727    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1728 
1729    insn->header.destreg__conditionalmod = conditional;
1730    brw_set_dest(p, insn, dest);
1731    brw_set_src0(p, insn, src0);
1732    brw_set_src1(p, insn, src1);
1733 
1734 /*    guess_execution_size(insn, src0); */
1735 
1736 
1737    /* Make it so that future instructions will use the computed flag
1738     * value until brw_set_predicate_control_flag_value() is called
1739     * again.
1740     */
1741    if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1742        dest.nr == 0) {
1743       p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1744       p->flag_value = 0xff;
1745    }
1746 }
1747 
1748 /* Issue 'wait' instruction for n1, host could program MMIO
1749    to wake up thread. */
brw_WAIT(struct brw_compile * p)1750 void brw_WAIT (struct brw_compile *p)
1751 {
1752    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1753    struct brw_reg src = brw_notification_1_reg();
1754 
1755    brw_set_dest(p, insn, src);
1756    brw_set_src0(p, insn, src);
1757    brw_set_src1(p, insn, brw_null_reg());
1758    insn->header.execution_size = 0; /* must */
1759    insn->header.predicate_control = 0;
1760    insn->header.compression_control = 0;
1761 }
1762 
1763 
1764 /***********************************************************************
1765  * Helpers for the various SEND message types:
1766  */
1767 
1768 /** Extended math function, float[8].
1769  */
brw_math(struct brw_compile * p,struct brw_reg dest,unsigned function,unsigned msg_reg_nr,struct brw_reg src,unsigned data_type,unsigned precision)1770 void brw_math( struct brw_compile *p,
1771 	       struct brw_reg dest,
1772 	       unsigned function,
1773 	       unsigned msg_reg_nr,
1774 	       struct brw_reg src,
1775 	       unsigned data_type,
1776 	       unsigned precision )
1777 {
1778    struct intel_context *intel = &p->brw->intel;
1779 
1780    if (intel->gen >= 6) {
1781       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1782 
1783       assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1784       assert(src.file == BRW_GENERAL_REGISTER_FILE);
1785 
1786       assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1787       if (intel->gen == 6)
1788 	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1789 
1790       /* Source modifiers are ignored for extended math instructions on Gen6. */
1791       if (intel->gen == 6) {
1792 	 assert(!src.negate);
1793 	 assert(!src.abs);
1794       }
1795 
1796       if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1797 	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1798 	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1799 	 assert(src.type != BRW_REGISTER_TYPE_F);
1800       } else {
1801 	 assert(src.type == BRW_REGISTER_TYPE_F);
1802       }
1803 
1804       /* Math is the same ISA format as other opcodes, except that CondModifier
1805        * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1806        */
1807       insn->header.destreg__conditionalmod = function;
1808 
1809       brw_set_dest(p, insn, dest);
1810       brw_set_src0(p, insn, src);
1811       brw_set_src1(p, insn, brw_null_reg());
1812    } else {
1813       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1814 
1815       /* Example code doesn't set predicate_control for send
1816        * instructions.
1817        */
1818       insn->header.predicate_control = 0;
1819       insn->header.destreg__conditionalmod = msg_reg_nr;
1820 
1821       brw_set_dest(p, insn, dest);
1822       brw_set_src0(p, insn, src);
1823       brw_set_math_message(p,
1824 			   insn,
1825 			   function,
1826 			   src.type == BRW_REGISTER_TYPE_D,
1827 			   precision,
1828 			   data_type);
1829    }
1830 }
1831 
1832 /** Extended math function, float[8].
1833  */
brw_math2(struct brw_compile * p,struct brw_reg dest,unsigned function,struct brw_reg src0,struct brw_reg src1)1834 void brw_math2(struct brw_compile *p,
1835 	       struct brw_reg dest,
1836 	       unsigned function,
1837 	       struct brw_reg src0,
1838 	       struct brw_reg src1)
1839 {
1840    struct intel_context *intel = &p->brw->intel;
1841    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1842 
1843    assert(intel->gen >= 6);
1844    (void) intel;
1845 
1846 
1847    assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1848    assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1849    assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1850 
1851    assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1852    if (intel->gen == 6) {
1853       assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1854       assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1855    }
1856 
1857    if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1858        function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1859        function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1860       assert(src0.type != BRW_REGISTER_TYPE_F);
1861       assert(src1.type != BRW_REGISTER_TYPE_F);
1862    } else {
1863       assert(src0.type == BRW_REGISTER_TYPE_F);
1864       assert(src1.type == BRW_REGISTER_TYPE_F);
1865    }
1866 
1867    /* Source modifiers are ignored for extended math instructions on Gen6. */
1868    if (intel->gen == 6) {
1869       assert(!src0.negate);
1870       assert(!src0.abs);
1871       assert(!src1.negate);
1872       assert(!src1.abs);
1873    }
1874 
1875    /* Math is the same ISA format as other opcodes, except that CondModifier
1876     * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1877     */
1878    insn->header.destreg__conditionalmod = function;
1879 
1880    brw_set_dest(p, insn, dest);
1881    brw_set_src0(p, insn, src0);
1882    brw_set_src1(p, insn, src1);
1883 }
1884 
1885 
1886 /**
1887  * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1888  * using a constant offset per channel.
1889  *
1890  * The offset must be aligned to oword size (16 bytes).  Used for
1891  * register spilling.
1892  */
brw_oword_block_write_scratch(struct brw_compile * p,struct brw_reg mrf,int num_regs,unsigned offset)1893 void brw_oword_block_write_scratch(struct brw_compile *p,
1894 				   struct brw_reg mrf,
1895 				   int num_regs,
1896 				   unsigned offset)
1897 {
1898    struct intel_context *intel = &p->brw->intel;
1899    uint32_t msg_control, msg_type;
1900    int mlen;
1901 
1902    if (intel->gen >= 6)
1903       offset /= 16;
1904 
1905    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1906 
1907    if (num_regs == 1) {
1908       msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1909       mlen = 2;
1910    } else {
1911       msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1912       mlen = 3;
1913    }
1914 
1915    /* Set up the message header.  This is g0, with g0.2 filled with
1916     * the offset.  We don't want to leave our offset around in g0 or
1917     * it'll screw up texture samples, so set it up inside the message
1918     * reg.
1919     */
1920    {
1921       brw_push_insn_state(p);
1922       brw_set_mask_control(p, BRW_MASK_DISABLE);
1923       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1924 
1925       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1926 
1927       /* set message header global offset field (reg 0, element 2) */
1928       brw_MOV(p,
1929 	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1930 				  mrf.nr,
1931 				  2), BRW_REGISTER_TYPE_UD),
1932 	      brw_imm_ud(offset));
1933 
1934       brw_pop_insn_state(p);
1935    }
1936 
1937    {
1938       struct brw_reg dest;
1939       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1940       int send_commit_msg;
1941       struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1942 					 BRW_REGISTER_TYPE_UW);
1943 
1944       if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1945 	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1946 	 src_header = vec16(src_header);
1947       }
1948       assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1949       insn->header.destreg__conditionalmod = mrf.nr;
1950 
1951       /* Until gen6, writes followed by reads from the same location
1952        * are not guaranteed to be ordered unless write_commit is set.
1953        * If set, then a no-op write is issued to the destination
1954        * register to set a dependency, and a read from the destination
1955        * can be used to ensure the ordering.
1956        *
1957        * For gen6, only writes between different threads need ordering
1958        * protection.  Our use of DP writes is all about register
1959        * spilling within a thread.
1960        */
1961       if (intel->gen >= 6) {
1962 	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1963 	 send_commit_msg = 0;
1964       } else {
1965 	 dest = src_header;
1966 	 send_commit_msg = 1;
1967       }
1968 
1969       brw_set_dest(p, insn, dest);
1970       if (intel->gen >= 6) {
1971 	 brw_set_src0(p, insn, mrf);
1972       } else {
1973 	 brw_set_src0(p, insn, brw_null_reg());
1974       }
1975 
1976       if (intel->gen >= 6)
1977 	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1978       else
1979 	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1980 
1981       brw_set_dp_write_message(p,
1982 			       insn,
1983 			       255, /* binding table index (255=stateless) */
1984 			       msg_control,
1985 			       msg_type,
1986 			       mlen,
1987 			       true, /* header_present */
1988 			       0, /* not a render target */
1989 			       send_commit_msg, /* response_length */
1990 			       0, /* eot */
1991 			       send_commit_msg);
1992    }
1993 }
1994 
1995 
1996 /**
1997  * Read a block of owords (half a GRF each) from the scratch buffer
1998  * using a constant index per channel.
1999  *
2000  * Offset must be aligned to oword size (16 bytes).  Used for register
2001  * spilling.
2002  */
2003 void
brw_oword_block_read_scratch(struct brw_compile * p,struct brw_reg dest,struct brw_reg mrf,int num_regs,unsigned offset)2004 brw_oword_block_read_scratch(struct brw_compile *p,
2005 			     struct brw_reg dest,
2006 			     struct brw_reg mrf,
2007 			     int num_regs,
2008 			     unsigned offset)
2009 {
2010    struct intel_context *intel = &p->brw->intel;
2011    uint32_t msg_control;
2012    int rlen;
2013 
2014    if (intel->gen >= 6)
2015       offset /= 16;
2016 
2017    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2018    dest = retype(dest, BRW_REGISTER_TYPE_UW);
2019 
2020    if (num_regs == 1) {
2021       msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2022       rlen = 1;
2023    } else {
2024       msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2025       rlen = 2;
2026    }
2027 
2028    {
2029       brw_push_insn_state(p);
2030       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2031       brw_set_mask_control(p, BRW_MASK_DISABLE);
2032 
2033       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2034 
2035       /* set message header global offset field (reg 0, element 2) */
2036       brw_MOV(p,
2037 	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2038 				  mrf.nr,
2039 				  2), BRW_REGISTER_TYPE_UD),
2040 	      brw_imm_ud(offset));
2041 
2042       brw_pop_insn_state(p);
2043    }
2044 
2045    {
2046       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2047 
2048       assert(insn->header.predicate_control == 0);
2049       insn->header.compression_control = BRW_COMPRESSION_NONE;
2050       insn->header.destreg__conditionalmod = mrf.nr;
2051 
2052       brw_set_dest(p, insn, dest);	/* UW? */
2053       if (intel->gen >= 6) {
2054 	 brw_set_src0(p, insn, mrf);
2055       } else {
2056 	 brw_set_src0(p, insn, brw_null_reg());
2057       }
2058 
2059       brw_set_dp_read_message(p,
2060 			      insn,
2061 			      255, /* binding table index (255=stateless) */
2062 			      msg_control,
2063 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2064 			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2065 			      1, /* msg_length */
2066                               true, /* header_present */
2067 			      rlen);
2068    }
2069 }
2070 
2071 /**
2072  * Read a float[4] vector from the data port Data Cache (const buffer).
2073  * Location (in buffer) should be a multiple of 16.
2074  * Used for fetching shader constants.
2075  */
brw_oword_block_read(struct brw_compile * p,struct brw_reg dest,struct brw_reg mrf,uint32_t offset,uint32_t bind_table_index)2076 void brw_oword_block_read(struct brw_compile *p,
2077 			  struct brw_reg dest,
2078 			  struct brw_reg mrf,
2079 			  uint32_t offset,
2080 			  uint32_t bind_table_index)
2081 {
2082    struct intel_context *intel = &p->brw->intel;
2083 
2084    /* On newer hardware, offset is in units of owords. */
2085    if (intel->gen >= 6)
2086       offset /= 16;
2087 
2088    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2089 
2090    brw_push_insn_state(p);
2091    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2092    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2093    brw_set_mask_control(p, BRW_MASK_DISABLE);
2094 
2095    brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2096 
2097    /* set message header global offset field (reg 0, element 2) */
2098    brw_MOV(p,
2099 	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2100 			       mrf.nr,
2101 			       2), BRW_REGISTER_TYPE_UD),
2102 	   brw_imm_ud(offset));
2103 
2104    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2105    insn->header.destreg__conditionalmod = mrf.nr;
2106 
2107    /* cast dest to a uword[8] vector */
2108    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2109 
2110    brw_set_dest(p, insn, dest);
2111    if (intel->gen >= 6) {
2112       brw_set_src0(p, insn, mrf);
2113    } else {
2114       brw_set_src0(p, insn, brw_null_reg());
2115    }
2116 
2117    brw_set_dp_read_message(p,
2118 			   insn,
2119 			   bind_table_index,
2120 			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2121 			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2122 			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2123 			   1, /* msg_length */
2124                            true, /* header_present */
2125 			   1); /* response_length (1 reg, 2 owords!) */
2126 
2127    brw_pop_insn_state(p);
2128 }
2129 
2130 
brw_fb_WRITE(struct brw_compile * p,int dispatch_width,unsigned msg_reg_nr,struct brw_reg src0,unsigned msg_control,unsigned binding_table_index,unsigned msg_length,unsigned response_length,bool eot,bool header_present)2131 void brw_fb_WRITE(struct brw_compile *p,
2132 		  int dispatch_width,
2133                   unsigned msg_reg_nr,
2134                   struct brw_reg src0,
2135                   unsigned msg_control,
2136                   unsigned binding_table_index,
2137                   unsigned msg_length,
2138                   unsigned response_length,
2139                   bool eot,
2140                   bool header_present)
2141 {
2142    struct intel_context *intel = &p->brw->intel;
2143    struct brw_instruction *insn;
2144    unsigned msg_type;
2145    struct brw_reg dest;
2146 
2147    if (dispatch_width == 16)
2148       dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2149    else
2150       dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2151 
2152    if (intel->gen >= 6) {
2153       insn = next_insn(p, BRW_OPCODE_SENDC);
2154    } else {
2155       insn = next_insn(p, BRW_OPCODE_SEND);
2156    }
2157    /* The execution mask is ignored for render target writes. */
2158    insn->header.predicate_control = 0;
2159    insn->header.compression_control = BRW_COMPRESSION_NONE;
2160 
2161    if (intel->gen >= 6) {
2162       /* headerless version, just submit color payload */
2163       src0 = brw_message_reg(msg_reg_nr);
2164 
2165       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2166    } else {
2167       insn->header.destreg__conditionalmod = msg_reg_nr;
2168 
2169       msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2170    }
2171 
2172    brw_set_dest(p, insn, dest);
2173    brw_set_src0(p, insn, src0);
2174    brw_set_dp_write_message(p,
2175 			    insn,
2176 			    binding_table_index,
2177 			    msg_control,
2178 			    msg_type,
2179 			    msg_length,
2180 			    header_present,
2181 			    eot, /* last render target write */
2182 			    response_length,
2183 			    eot,
2184 			    0 /* send_commit_msg */);
2185 }
2186 
2187 
2188 /**
2189  * Texture sample instruction.
2190  * Note: the msg_type plus msg_length values determine exactly what kind
2191  * of sampling operation is performed.  See volume 4, page 161 of docs.
2192  */
brw_SAMPLE(struct brw_compile * p,struct brw_reg dest,unsigned msg_reg_nr,struct brw_reg src0,unsigned binding_table_index,unsigned sampler,unsigned writemask,unsigned msg_type,unsigned response_length,unsigned msg_length,unsigned header_present,unsigned simd_mode,unsigned return_format)2193 void brw_SAMPLE(struct brw_compile *p,
2194 		struct brw_reg dest,
2195 		unsigned msg_reg_nr,
2196 		struct brw_reg src0,
2197 		unsigned binding_table_index,
2198 		unsigned sampler,
2199 		unsigned writemask,
2200 		unsigned msg_type,
2201 		unsigned response_length,
2202 		unsigned msg_length,
2203 		unsigned header_present,
2204 		unsigned simd_mode,
2205 		unsigned return_format)
2206 {
2207    struct intel_context *intel = &p->brw->intel;
2208    bool need_stall = 0;
2209 
2210    if (writemask == 0) {
2211       /*printf("%s: zero writemask??\n", __FUNCTION__); */
2212       return;
2213    }
2214 
2215    /* Hardware doesn't do destination dependency checking on send
2216     * instructions properly.  Add a workaround which generates the
2217     * dependency by other means.  In practice it seems like this bug
2218     * only crops up for texture samples, and only where registers are
2219     * written by the send and then written again later without being
2220     * read in between.  Luckily for us, we already track that
2221     * information and use it to modify the writemask for the
2222     * instruction, so that is a guide for whether a workaround is
2223     * needed.
2224     */
2225    if (writemask != BRW_WRITEMASK_XYZW) {
2226       unsigned dst_offset = 0;
2227       unsigned i, newmask = 0, len = 0;
2228 
2229       for (i = 0; i < 4; i++) {
2230 	 if (writemask & (1<<i))
2231 	    break;
2232 	 dst_offset += 2;
2233       }
2234       for (; i < 4; i++) {
2235 	 if (!(writemask & (1<<i)))
2236 	    break;
2237 	 newmask |= 1<<i;
2238 	 len++;
2239       }
2240 
2241       if (newmask != writemask) {
2242 	 need_stall = 1;
2243          /* printf("need stall %x %x\n", newmask , writemask); */
2244       }
2245       else {
2246 	 bool dispatch_16 = false;
2247 
2248 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2249 
2250 	 guess_execution_size(p, p->current, dest);
2251 	 if (p->current->header.execution_size == BRW_EXECUTE_16)
2252 	    dispatch_16 = true;
2253 
2254 	 newmask = ~newmask & BRW_WRITEMASK_XYZW;
2255 
2256 	 brw_push_insn_state(p);
2257 
2258 	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2259 	 brw_set_mask_control(p, BRW_MASK_DISABLE);
2260 
2261 	 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2262 		 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2263   	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2264 
2265 	 brw_pop_insn_state(p);
2266 
2267   	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2268 	 dest = offset(dest, dst_offset);
2269 
2270 	 /* For 16-wide dispatch, masked channels are skipped in the
2271 	  * response.  For 8-wide, masked channels still take up slots,
2272 	  * and are just not written to.
2273 	  */
2274 	 if (dispatch_16)
2275 	    response_length = len * 2;
2276       }
2277    }
2278 
2279    {
2280       struct brw_instruction *insn;
2281 
2282       gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2283 
2284       insn = next_insn(p, BRW_OPCODE_SEND);
2285       insn->header.predicate_control = 0; /* XXX */
2286       insn->header.compression_control = BRW_COMPRESSION_NONE;
2287       if (intel->gen < 6)
2288 	  insn->header.destreg__conditionalmod = msg_reg_nr;
2289 
2290       brw_set_dest(p, insn, dest);
2291       brw_set_src0(p, insn, src0);
2292       brw_set_sampler_message(p, insn,
2293 			      binding_table_index,
2294 			      sampler,
2295 			      msg_type,
2296 			      response_length,
2297 			      msg_length,
2298 			      header_present,
2299 			      simd_mode,
2300 			      return_format);
2301    }
2302 
2303    if (need_stall) {
2304       struct brw_reg reg = vec8(offset(dest, response_length-1));
2305 
2306       /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
2307        */
2308       brw_push_insn_state(p);
2309       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2310       brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2311 	      retype(reg, BRW_REGISTER_TYPE_UD));
2312       brw_pop_insn_state(p);
2313    }
2314 
2315 }
2316 
2317 /* All these variables are pretty confusing - we might be better off
2318  * using bitmasks and macros for this, in the old style.  Or perhaps
2319  * just having the caller instantiate the fields in dword3 itself.
2320  */
brw_urb_WRITE(struct brw_compile * p,struct brw_reg dest,unsigned msg_reg_nr,struct brw_reg src0,bool allocate,bool used,unsigned msg_length,unsigned response_length,bool eot,bool writes_complete,unsigned offset,unsigned swizzle)2321 void brw_urb_WRITE(struct brw_compile *p,
2322 		   struct brw_reg dest,
2323 		   unsigned msg_reg_nr,
2324 		   struct brw_reg src0,
2325 		   bool allocate,
2326 		   bool used,
2327 		   unsigned msg_length,
2328 		   unsigned response_length,
2329 		   bool eot,
2330 		   bool writes_complete,
2331 		   unsigned offset,
2332 		   unsigned swizzle)
2333 {
2334    struct intel_context *intel = &p->brw->intel;
2335    struct brw_instruction *insn;
2336 
2337    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2338 
2339    if (intel->gen == 7) {
2340       /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2341       brw_push_insn_state(p);
2342       brw_set_access_mode(p, BRW_ALIGN_1);
2343       brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2344 		       BRW_REGISTER_TYPE_UD),
2345 	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2346 		brw_imm_ud(0xff00));
2347       brw_pop_insn_state(p);
2348    }
2349 
2350    insn = next_insn(p, BRW_OPCODE_SEND);
2351 
2352    assert(msg_length < BRW_MAX_MRF);
2353 
2354    brw_set_dest(p, insn, dest);
2355    brw_set_src0(p, insn, src0);
2356    brw_set_src1(p, insn, brw_imm_d(0));
2357 
2358    if (intel->gen < 6)
2359       insn->header.destreg__conditionalmod = msg_reg_nr;
2360 
2361    brw_set_urb_message(p,
2362 		       insn,
2363 		       allocate,
2364 		       used,
2365 		       msg_length,
2366 		       response_length,
2367 		       eot,
2368 		       writes_complete,
2369 		       offset,
2370 		       swizzle);
2371 }
2372 
2373 static int
next_ip(struct brw_compile * p,int ip)2374 next_ip(struct brw_compile *p, int ip)
2375 {
2376    struct brw_instruction *insn = (void *)p->store + ip;
2377 
2378    if (insn->header.cmpt_control)
2379       return ip + 8;
2380    else
2381       return ip + 16;
2382 }
2383 
2384 static int
brw_find_next_block_end(struct brw_compile * p,int start)2385 brw_find_next_block_end(struct brw_compile *p, int start)
2386 {
2387    int ip;
2388    void *store = p->store;
2389 
2390    for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2391       struct brw_instruction *insn = store + ip;
2392 
2393       switch (insn->header.opcode) {
2394       case BRW_OPCODE_ENDIF:
2395       case BRW_OPCODE_ELSE:
2396       case BRW_OPCODE_WHILE:
2397       case BRW_OPCODE_HALT:
2398 	 return ip;
2399       }
2400    }
2401 
2402    return 0;
2403 }
2404 
2405 /* There is no DO instruction on gen6, so to find the end of the loop
2406  * we have to see if the loop is jumping back before our start
2407  * instruction.
2408  */
2409 static int
brw_find_loop_end(struct brw_compile * p,int start)2410 brw_find_loop_end(struct brw_compile *p, int start)
2411 {
2412    struct intel_context *intel = &p->brw->intel;
2413    int ip;
2414    int scale = 8;
2415    void *store = p->store;
2416 
2417    /* Always start after the instruction (such as a WHILE) we're trying to fix
2418     * up.
2419     */
2420    for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2421       struct brw_instruction *insn = store + ip;
2422 
2423       if (insn->header.opcode == BRW_OPCODE_WHILE) {
2424 	 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2425 				   : insn->bits3.break_cont.jip;
2426 	 if (ip + jip * scale <= start)
2427 	    return ip;
2428       }
2429    }
2430    assert(!"not reached");
2431    return start;
2432 }
2433 
2434 /* After program generation, go back and update the UIP and JIP of
2435  * BREAK, CONT, and HALT instructions to their correct locations.
2436  */
2437 void
brw_set_uip_jip(struct brw_compile * p)2438 brw_set_uip_jip(struct brw_compile *p)
2439 {
2440    struct intel_context *intel = &p->brw->intel;
2441    int ip;
2442    int scale = 8;
2443    void *store = p->store;
2444 
2445    if (intel->gen < 6)
2446       return;
2447 
2448    for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2449       struct brw_instruction *insn = store + ip;
2450 
2451       if (insn->header.cmpt_control) {
2452 	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2453 	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
2454 		insn->header.opcode != BRW_OPCODE_CONTINUE &&
2455 		insn->header.opcode != BRW_OPCODE_HALT);
2456 	 continue;
2457       }
2458 
2459       int block_end_ip = brw_find_next_block_end(p, ip);
2460       switch (insn->header.opcode) {
2461       case BRW_OPCODE_BREAK:
2462          assert(block_end_ip != 0);
2463 	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2464 	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2465 	 insn->bits3.break_cont.uip =
2466 	    (brw_find_loop_end(p, ip) - ip +
2467              (intel->gen == 6 ? 16 : 0)) / scale;
2468 	 break;
2469       case BRW_OPCODE_CONTINUE:
2470          assert(block_end_ip != 0);
2471 	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2472 	 insn->bits3.break_cont.uip =
2473             (brw_find_loop_end(p, ip) - ip) / scale;
2474 
2475 	 assert(insn->bits3.break_cont.uip != 0);
2476 	 assert(insn->bits3.break_cont.jip != 0);
2477 	 break;
2478 
2479       case BRW_OPCODE_ENDIF:
2480          if (block_end_ip == 0)
2481             insn->bits3.break_cont.jip = 2;
2482          else
2483             insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2484 	 break;
2485 
2486       case BRW_OPCODE_HALT:
2487 	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2488 	  *
2489 	  *    "In case of the halt instruction not inside any conditional
2490 	  *     code block, the value of <JIP> and <UIP> should be the
2491 	  *     same. In case of the halt instruction inside conditional code
2492 	  *     block, the <UIP> should be the end of the program, and the
2493 	  *     <JIP> should be end of the most inner conditional code block."
2494 	  *
2495 	  * The uip will have already been set by whoever set up the
2496 	  * instruction.
2497 	  */
2498 	 if (block_end_ip == 0) {
2499 	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
2500 	 } else {
2501 	    insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2502 	 }
2503 	 assert(insn->bits3.break_cont.uip != 0);
2504 	 assert(insn->bits3.break_cont.jip != 0);
2505 	 break;
2506       }
2507    }
2508 }
2509 
brw_ff_sync(struct brw_compile * p,struct brw_reg dest,unsigned msg_reg_nr,struct brw_reg src0,bool allocate,unsigned response_length,bool eot)2510 void brw_ff_sync(struct brw_compile *p,
2511 		   struct brw_reg dest,
2512 		   unsigned msg_reg_nr,
2513 		   struct brw_reg src0,
2514 		   bool allocate,
2515 		   unsigned response_length,
2516 		   bool eot)
2517 {
2518    struct intel_context *intel = &p->brw->intel;
2519    struct brw_instruction *insn;
2520 
2521    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2522 
2523    insn = next_insn(p, BRW_OPCODE_SEND);
2524    brw_set_dest(p, insn, dest);
2525    brw_set_src0(p, insn, src0);
2526    brw_set_src1(p, insn, brw_imm_d(0));
2527 
2528    if (intel->gen < 6)
2529       insn->header.destreg__conditionalmod = msg_reg_nr;
2530 
2531    brw_set_ff_sync_message(p,
2532 			   insn,
2533 			   allocate,
2534 			   response_length,
2535 			   eot);
2536 }
2537 
2538 /**
2539  * Emit the SEND instruction necessary to generate stream output data on Gen6
2540  * (for transform feedback).
2541  *
2542  * If send_commit_msg is true, this is the last piece of stream output data
2543  * from this thread, so send the data as a committed write.  According to the
2544  * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2545  *
2546  *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2547  *   writes are complete by sending the final write as a committed write."
2548  */
2549 void
brw_svb_write(struct brw_compile * p,struct brw_reg dest,unsigned msg_reg_nr,struct brw_reg src0,unsigned binding_table_index,bool send_commit_msg)2550 brw_svb_write(struct brw_compile *p,
2551               struct brw_reg dest,
2552               unsigned msg_reg_nr,
2553               struct brw_reg src0,
2554               unsigned binding_table_index,
2555               bool   send_commit_msg)
2556 {
2557    struct brw_instruction *insn;
2558 
2559    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2560 
2561    insn = next_insn(p, BRW_OPCODE_SEND);
2562    brw_set_dest(p, insn, dest);
2563    brw_set_src0(p, insn, src0);
2564    brw_set_src1(p, insn, brw_imm_d(0));
2565    brw_set_dp_write_message(p, insn,
2566                             binding_table_index,
2567                             0, /* msg_control: ignored */
2568                             GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2569                             1, /* msg_length */
2570                             true, /* header_present */
2571                             0, /* last_render_target: ignored */
2572                             send_commit_msg, /* response_length */
2573                             0, /* end_of_thread */
2574                             send_commit_msg); /* send_commit_msg */
2575 }
2576 
2577 /**
2578  * This instruction is generated as a single-channel align1 instruction by
2579  * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2580  *
2581  * We can't use the typed atomic op in the FS because that has the execution
2582  * mask ANDed with the pixel mask, but we just want to write the one dword for
2583  * all the pixels.
2584  *
2585  * We don't use the SIMD4x2 atomic ops in the VS because want to just write
2586  * one u32.  So we use the same untyped atomic write message as the pixel
2587  * shader.
2588  *
2589  * The untyped atomic operation requires a BUFFER surface type with RAW
2590  * format, and is only accessible through the legacy DATA_CACHE dataport
2591  * messages.
2592  */
brw_shader_time_add(struct brw_compile * p,int base_mrf,uint32_t surf_index)2593 void brw_shader_time_add(struct brw_compile *p,
2594                          int base_mrf,
2595                          uint32_t surf_index)
2596 {
2597    struct intel_context *intel = &p->brw->intel;
2598    assert(intel->gen >= 7);
2599 
2600    brw_push_insn_state(p);
2601    brw_set_access_mode(p, BRW_ALIGN_1);
2602    brw_set_mask_control(p, BRW_MASK_DISABLE);
2603    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2604    brw_pop_insn_state(p);
2605 
2606    /* We use brw_vec1_reg and unmasked because we want to increment the given
2607     * offset only once.
2608     */
2609    brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2610                                       BRW_ARF_NULL, 0));
2611    brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2612                                       base_mrf, 0));
2613 
2614    bool header_present = false;
2615    bool eot = false;
2616    uint32_t mlen = 2; /* offset, value */
2617    uint32_t rlen = 0;
2618    brw_set_message_descriptor(p, send,
2619                               GEN7_SFID_DATAPORT_DATA_CACHE,
2620                               mlen, rlen, header_present, eot);
2621 
2622    send->bits3.ud |= 6 << 14; /* untyped atomic op */
2623    send->bits3.ud |= 0 << 13; /* no return data */
2624    send->bits3.ud |= 1 << 12; /* SIMD8 mode */
2625    send->bits3.ud |= BRW_AOP_ADD << 8;
2626    send->bits3.ud |= surf_index << 0;
2627 }
2628