/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_optimization.h"
#include "glsl/ir_print_visitor.h"

/** @file brw_fs_schedule_instructions.cpp
 *
 * List scheduling of FS instructions.
 *
 * The basic model of the list scheduler is to take a basic block,
 * compute a DAG of the dependencies (RAW ordering with latency, WAW
 * ordering, WAR ordering), and make a list of the DAG heads.
 * Heuristically pick a DAG head, then put all the children that are
 * now DAG heads into the list of things to schedule.
 *
 * The heuristic is the important part.  We're trying to be cheap,
 * since actually computing the optimal scheduling is NP-complete.
 * What we do is track a "current clock".  When we schedule a node, we
 * update the earliest-unblocked clock time of its children, and
 * increment the clock.  Then, when trying to schedule, we just pick
 * the earliest-unblocked instruction to schedule.
 *
 * Note that often there will be many things which could execute
 * immediately, and there are a range of heuristic options to choose
 * from in picking among those.
 */
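
/* A minimal illustrative sketch of the clock heuristic above (the
 * instructions and numbers are made up, not taken from a real shader):
 * suppose a MOV is picked at clock 0 and it has a dependent ADD, while
 * an unrelated MUL is also a DAG head.  After the MOV is scheduled the
 * clock ticks to 1, the ADD's unblocked_time is pushed out to
 * 1 + the MOV's latency, and the MUL stays at 0, so the MUL is now the
 * earliest-unblocked node and gets scheduled next.
 */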

class schedule_node : public exec_node
{
public:
   schedule_node(fs_inst *inst)
   {
      this->inst = inst;
      this->child_array_size = 0;
      this->children = NULL;
      this->child_latency = NULL;
      this->child_count = 0;
      this->parent_count = 0;
      this->unblocked_time = 0;

      int chans = 8;
      int math_latency = 22;

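      /* Rough cost model only: each math opcode is charged
       * rounds * chans * math_latency cycles, so for example
       * SHADER_OPCODE_POW below works out to 8 * 8 * 22 = 1408.  These
       * numbers only need to order instructions sensibly, not be
       * cycle-accurate.
       */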
      switch (inst->opcode) {
      case SHADER_OPCODE_RCP:
	 this->latency = 1 * chans * math_latency;
	 break;
      case SHADER_OPCODE_RSQ:
	 this->latency = 2 * chans * math_latency;
	 break;
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_LOG2:
	 /* full precision log.  partial is 2. */
	 this->latency = 3 * chans * math_latency;
	 break;
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_EXP2:
	 /* full precision.  partial is 3, same throughput. */
	 this->latency = 4 * chans * math_latency;
	 break;
      case SHADER_OPCODE_POW:
	 this->latency = 8 * chans * math_latency;
	 break;
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
	 /* minimum latency, max is 12 rounds. */
	 this->latency = 5 * chans * math_latency;
	 break;
      default:
	 this->latency = 2;
	 break;
      }
   }

   fs_inst *inst;
   schedule_node **children;
   int *child_latency;
   int child_count;
   int parent_count;
   int child_array_size;
   int unblocked_time;
   int latency;
};

class instruction_scheduler {
public:
   instruction_scheduler(fs_visitor *v, void *mem_ctx, int virtual_grf_count)
   {
      this->v = v;
      this->mem_ctx = ralloc_context(mem_ctx);
      this->virtual_grf_count = virtual_grf_count;
      this->instructions.make_empty();
      this->instructions_to_schedule = 0;
   }

   ~instruction_scheduler()
   {
      ralloc_free(this->mem_ctx);
   }
   void add_barrier_deps(schedule_node *n);
   void add_dep(schedule_node *before, schedule_node *after, int latency);
   void add_dep(schedule_node *before, schedule_node *after);

   void add_inst(fs_inst *inst);
   void calculate_deps();
   void schedule_instructions(fs_inst *next_block_header);

   bool is_compressed(fs_inst *inst);

   void *mem_ctx;

   int instructions_to_schedule;
   int virtual_grf_count;
   exec_list instructions;
   fs_visitor *v;
};
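
/* Intended flow, as exercised by fs_visitor::schedule_instructions() at
 * the bottom of this file: pull one basic block's worth of instructions
 * in with add_inst(), build the dependency DAG with calculate_deps(),
 * then emit the instructions back out in their new order with
 * schedule_instructions().
 */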

void
instruction_scheduler::add_inst(fs_inst *inst)
{
   schedule_node *n = new(mem_ctx) schedule_node(inst);

   assert(!inst->is_head_sentinel());
   assert(!inst->is_tail_sentinel());

   this->instructions_to_schedule++;

   inst->remove();
   instructions.push_tail(n);
}

/**
 * Add a dependency between two instruction nodes.
 *
 * The @after node will be scheduled after @before.  We will try to
 * schedule it @latency cycles after @before, but no guarantees there.
 */
void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
			       int latency)
{
   if (!before || !after)
      return;

   assert(before != after);

   for (int i = 0; i < before->child_count; i++) {
      if (before->children[i] == after) {
	 before->child_latency[i] = MAX2(before->child_latency[i], latency);
	 return;
      }
   }

   if (before->child_array_size <= before->child_count) {
      if (before->child_array_size < 16)
	 before->child_array_size = 16;
      else
	 before->child_array_size *= 2;

      before->children = reralloc(mem_ctx, before->children,
				  schedule_node *,
				  before->child_array_size);
      before->child_latency = reralloc(mem_ctx, before->child_latency,
				       int, before->child_array_size);
   }

   before->children[before->child_count] = after;
   before->child_latency[before->child_count] = latency;
   before->child_count++;
   after->parent_count++;
}

void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
{
   if (!before)
      return;

   add_dep(before, after, before->latency);
}

/**
 * Sometimes we really want this node to execute after everything that
 * was before it and before everything that followed it.  This adds
 * the deps to do so.
 */
void
instruction_scheduler::add_barrier_deps(schedule_node *n)
{
   schedule_node *prev = (schedule_node *)n->prev;
   schedule_node *next = (schedule_node *)n->next;

   if (prev) {
      while (!prev->is_head_sentinel()) {
	 add_dep(prev, n, 0);
	 prev = (schedule_node *)prev->prev;
      }
   }

   if (next) {
      while (!next->is_tail_sentinel()) {
	 add_dep(n, next, 0);
	 next = (schedule_node *)next->next;
      }
   }
}

/* instruction scheduling needs to be aware of when an MRF write
 * actually writes 2 MRFs.
 */
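/* For example, as modeled by calculate_deps() below, a compressed
 * (SIMD16) write to m2 also touches m3, and a compressed COMPR4 write
 * to m2 pairs with m6.  This is just a restatement of the reg++ /
 * reg += 4 handling there, not a full description of the hardware
 * rules.
 */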
bool
instruction_scheduler::is_compressed(fs_inst *inst)
{
   return (v->c->dispatch_width == 16 &&
	   !inst->force_uncompressed &&
	   !inst->force_sechalf);
}

void
instruction_scheduler::calculate_deps()
{
   schedule_node *last_grf_write[virtual_grf_count];
   schedule_node *last_mrf_write[BRW_MAX_MRF];
   schedule_node *last_conditional_mod = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately.  We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   /* The last instruction always needs to still be the last
    * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
    * WHILE) and scheduling other things after it would disturb the
    * basic block, or it's FB_WRITE and we should do a better job at
    * dead code elimination anyway.
    */
   schedule_node *last = (schedule_node *)instructions.get_tail();
   add_barrier_deps(last);

   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_list(node, &instructions) {
      schedule_node *n = (schedule_node *)node;
      fs_inst *inst = n->inst;

      /* read-after-write deps. */
      for (int i = 0; i < 3; i++) {
	 if (inst->src[i].file == GRF) {
	    add_dep(last_grf_write[inst->src[i].reg], n);
	 } else if (inst->src[i].file == FIXED_HW_REG &&
		    (inst->src[i].fixed_hw_reg.file ==
		     BRW_GENERAL_REGISTER_FILE)) {
	    add_dep(last_fixed_grf_write, n);
	 } else if (inst->src[i].file != BAD_FILE &&
		    inst->src[i].file != IMM &&
		    inst->src[i].file != UNIFORM) {
	    assert(inst->src[i].file != MRF);
	    add_barrier_deps(n);
	 }
      }

      for (int i = 0; i < inst->mlen; i++) {
	 /* It looks like the MRF regs are released in the send
	  * instruction once it's sent, not when the result comes
	  * back.
	  */
	 add_dep(last_mrf_write[inst->base_mrf + i], n);
      }

      if (inst->predicated) {
	 assert(last_conditional_mod);
	 add_dep(last_conditional_mod, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == GRF) {
	 add_dep(last_grf_write[inst->dst.reg], n);
	 last_grf_write[inst->dst.reg] = n;
      } else if (inst->dst.file == MRF) {
	 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;

	 add_dep(last_mrf_write[reg], n);
	 last_mrf_write[reg] = n;
	 if (is_compressed(inst)) {
	    if (inst->dst.reg & BRW_MRF_COMPR4)
	       reg += 4;
	    else
	       reg++;
	    add_dep(last_mrf_write[reg], n);
	    last_mrf_write[reg] = n;
	 }
      } else if (inst->dst.file == FIXED_HW_REG &&
		 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
	 last_fixed_grf_write = n;
      } else if (inst->dst.file != BAD_FILE) {
	 add_barrier_deps(n);
      }

      if (inst->mlen > 0) {
	 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
	    add_dep(last_mrf_write[inst->base_mrf + i], n);
	    last_mrf_write[inst->base_mrf + i] = n;
	 }
      }

      /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
       * conditional_mod, because it sets the flag register.
       */
      if (inst->conditional_mod ||
          inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
	 add_dep(last_conditional_mod, n, 0);
	 last_conditional_mod = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   last_conditional_mod = NULL;
   last_fixed_grf_write = NULL;

   exec_node *node;
   exec_node *prev;
   for (node = instructions.get_tail(), prev = node->prev;
	!node->is_head_sentinel();
	node = prev, prev = node->prev) {
      schedule_node *n = (schedule_node *)node;
      fs_inst *inst = n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < 3; i++) {
	 if (inst->src[i].file == GRF) {
	    add_dep(n, last_grf_write[inst->src[i].reg]);
	 } else if (inst->src[i].file == FIXED_HW_REG &&
		    (inst->src[i].fixed_hw_reg.file ==
		     BRW_GENERAL_REGISTER_FILE)) {
	    add_dep(n, last_fixed_grf_write);
	 } else if (inst->src[i].file != BAD_FILE &&
		    inst->src[i].file != IMM &&
		    inst->src[i].file != UNIFORM) {
	    assert(inst->src[i].file != MRF);
	    add_barrier_deps(n);
	 }
      }

      for (int i = 0; i < inst->mlen; i++) {
	 /* It looks like the MRF regs are released in the send
	  * instruction once it's sent, not when the result comes
	  * back.
	  */
	 add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
      }

      if (inst->predicated) {
	 add_dep(n, last_conditional_mod);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as a WAR dependency.
       */
      if (inst->dst.file == GRF) {
	 last_grf_write[inst->dst.reg] = n;
      } else if (inst->dst.file == MRF) {
	 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;

	 last_mrf_write[reg] = n;

	 if (is_compressed(inst)) {
	    if (inst->dst.reg & BRW_MRF_COMPR4)
	       reg += 4;
	    else
	       reg++;

	    last_mrf_write[reg] = n;
	 }
      } else if (inst->dst.file == FIXED_HW_REG &&
		 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
	 last_fixed_grf_write = n;
      } else if (inst->dst.file != BAD_FILE) {
	 add_barrier_deps(n);
      }

      if (inst->mlen > 0) {
	 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
	    last_mrf_write[inst->base_mrf + i] = n;
	 }
      }

      /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
       * conditional_mod, because it sets the flag register.
       */
      if (inst->conditional_mod ||
          inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
	 last_conditional_mod = n;
      }
   }
}

void
instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
{
   int time = 0;

   /* Remove non-DAG heads from the list. */
   foreach_list_safe(node, &instructions) {
      schedule_node *n = (schedule_node *)node;
      if (n->parent_count != 0)
	 n->remove();
   }

   while (!instructions.is_empty()) {
      schedule_node *chosen = NULL;
      int chosen_time = 0;

      foreach_list(node, &instructions) {
	 schedule_node *n = (schedule_node *)node;

	 if (!chosen || n->unblocked_time < chosen_time) {
	    chosen = n;
	    chosen_time = n->unblocked_time;
	 }
      }

      /* Schedule this instruction. */
      assert(chosen);
      chosen->remove();
      next_block_header->insert_before(chosen->inst);
      instructions_to_schedule--;

      /* Bump the clock.  If we expected a delay for scheduling, then
       * bump the clock to reflect that.
       */
      time = MAX2(time + 1, chosen_time);

      /* Now that we've scheduled a new instruction, some of its
       * children can be promoted to the list of instructions ready to
       * be scheduled.  Update the children's unblocked time for this
       * DAG edge as we do so.
       */
      for (int i = 0; i < chosen->child_count; i++) {
	 schedule_node *child = chosen->children[i];

	 child->unblocked_time = MAX2(child->unblocked_time,
				      time + chosen->child_latency[i]);

	 child->parent_count--;
	 if (child->parent_count == 0) {
	    instructions.push_tail(child);
	 }
      }

      /* Shared resource: the mathbox.  There's one per EU on later
       * generations (and it's even more limited pre-gen6), so if we
       * send something off to it then the next math isn't going to
       * make progress until the first is done.
       */
      if (chosen->inst->is_math()) {
	 foreach_list(node, &instructions) {
	    schedule_node *n = (schedule_node *)node;

	    if (n->inst->is_math())
	       n->unblocked_time = MAX2(n->unblocked_time,
					time + chosen->latency);
	 }
      }
   }

   assert(instructions_to_schedule == 0);
}

void
fs_visitor::schedule_instructions()
{
   fs_inst *next_block_header = (fs_inst *)instructions.head;
   instruction_scheduler sched(this, mem_ctx, this->virtual_grf_count);

   while (!next_block_header->is_tail_sentinel()) {
      /* Add things to be scheduled until we get to a new BB. */
      while (!next_block_header->is_tail_sentinel()) {
	 fs_inst *inst = next_block_header;
	 next_block_header = (fs_inst *)next_block_header->next;

	 sched.add_inst(inst);
	 if (inst->opcode == BRW_OPCODE_IF ||
	     inst->opcode == BRW_OPCODE_ELSE ||
	     inst->opcode == BRW_OPCODE_ENDIF ||
	     inst->opcode == BRW_OPCODE_DO ||
	     inst->opcode == BRW_OPCODE_WHILE ||
	     inst->opcode == BRW_OPCODE_BREAK ||
	     inst->opcode == BRW_OPCODE_CONTINUE) {
	    break;
	 }
      }
      sched.calculate_deps();
      sched.schedule_instructions(next_block_header);
   }

   this->live_intervals_valid = false;
}