/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "nv50_ir.h"
#include "nv50_ir_target.h"
#include "nv50_ir_build_util.h"

extern "C" {
#include "util/u_math.h"
}

namespace nv50_ir {

bool
Instruction::isNop() const
{
   if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
      return true;
   if (terminator || join) // XXX: should terminator imply flow ?
      return false;
   if (!fixed && op == OP_NOP)
      return true;

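   // A def whose register id is negative was never assigned a hardware
   // register, i.e. the result is unused (this check is meaningful once
   // registers have been allocated).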
   if (defExists(0) && def(0).rep()->reg.data.id < 0) {
      for (int d = 1; defExists(d); ++d)
         if (def(d).rep()->reg.data.id >= 0)
            WARN("part of vector result is unused !\n");
      return true;
   }

   if (op == OP_MOV || op == OP_UNION) {
      if (!getDef(0)->equals(getSrc(0)))
         return false;
      if (op == OP_UNION)
         if (!def(0).rep()->equals(getSrc(1)))
            return false;
      return true;
   }

   return false;
}

bool Instruction::isDead() const
{
   if (op == OP_STORE ||
       op == OP_EXPORT ||
       op == OP_WRSV)
      return false;

   for (int d = 0; defExists(d); ++d)
      if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
         return false;

   if (terminator || asFlow())
      return false;
   if (fixed)
      return false;

   return true;
}

// =============================================================================

class CopyPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// Propagate all MOVs forward to make subsequent optimization easier, except if
// the sources stem from a phi, in which case we don't want to mess up potential
// swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
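// Illustrative example (hypothetical IR):
//   mov $r1, $r0
//   add $r2, $r1, $r3
// becomes
//   add $r2, $r0, $r3
// after which the mov, now unreferenced, is deleted.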
bool
CopyPropagation::visit(BasicBlock *bb)
{
   Instruction *mov, *si, *next;

   for (mov = bb->getEntry(); mov; mov = next) {
      next = mov->next;
      if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
         continue;
      if (mov->getPredicate())
         continue;
      if (mov->def(0).getFile() != mov->src(0).getFile())
         continue;
      si = mov->getSrc(0)->getInsn();
      if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
         // propagate
         mov->def(0).replace(mov->getSrc(0), false);
         delete_Instruction(prog, mov);
      }
   }
   return true;
}

// =============================================================================

class LoadPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void checkSwapSrc01(Instruction *);

   bool isCSpaceLoad(Instruction *);
   bool isImmd32Load(Instruction *);
   bool isAttribOrSharedLoad(Instruction *);
};

bool
LoadPropagation::isCSpaceLoad(Instruction *ld)
{
   return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
}

bool
LoadPropagation::isImmd32Load(Instruction *ld)
{
   if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4))
      return false;
   return ld->src(0).getFile() == FILE_IMMEDIATE;
}

bool
LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
{
   return ld &&
      (ld->op == OP_VFETCH ||
       (ld->op == OP_LOAD &&
        (ld->src(0).getFile() == FILE_SHADER_INPUT ||
         ld->src(0).getFile() == FILE_MEMORY_SHARED)));
}

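// Swap the sources of commutative ops (and of SET/SLCT, whose condition code
// is corrected afterwards) so that the operand most profitable to fold, i.e.
// a const-space load, a 32-bit immediate, or an attribute/shared-memory load,
// ends up in the source slot that insnCanLoad() is later queried for.
// (Rationale inferred from the checks below.)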
void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
   if (!prog->getTarget()->getOpInfo(insn).commutative)
      if (insn->op != OP_SET && insn->op != OP_SLCT)
         return;
   if (insn->src(1).getFile() != FILE_GPR)
      return;

   Instruction *i0 = insn->getSrc(0)->getInsn();
   Instruction *i1 = insn->getSrc(1)->getInsn();

   if (isCSpaceLoad(i0)) {
      if (!isCSpaceLoad(i1))
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isImmd32Load(i0)) {
      if (!isCSpaceLoad(i1) && !isImmd32Load(i1))
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isAttribOrSharedLoad(i1)) {
      if (!isAttribOrSharedLoad(i0))
         insn->swapSources(0, 1);
      else
         return;
   } else {
      return;
   }

   if (insn->op == OP_SET)
      insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SLCT)
      insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
}

bool
LoadPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->srcExists(1))
         checkSwapSrc01(i);

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *ld = i->getSrc(s)->getInsn();

         if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
            continue;
         if (!targ->insnCanLoad(i, s, ld))
            continue;

         // propagate !
         i->setSrc(s, ld->getSrc(0));
         if (ld->src(0).isIndirect(0))
            i->setIndirect(s, 0, ld->getIndirect(0, 0));

         if (ld->getDef(0)->refCount() == 0)
            delete_Instruction(prog, ld);
      }
   }
   return true;
}

// =============================================================================

// Evaluate constant expressions.
class ConstantFolding : public Pass
{
public:
   bool foldAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void expr(Instruction *, ImmediateValue&, ImmediateValue&);
   void opnd(Instruction *, ImmediateValue&, int s);

   void unary(Instruction *, const ImmediateValue&);

   void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);

   // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET
   CmpInstruction *findOriginForTestWithZero(Value *);

   unsigned int foldCount;

   BuildUtil bld;
};

// TODO: remember generated immediates and only revisit these
bool
ConstantFolding::foldAll(Program *prog)
{
   unsigned int iterCount = 0;
   do {
      foldCount = 0;
      if (!run(prog))
         return false;
   } while (foldCount && ++iterCount < 2);
   return true;
}

bool
ConstantFolding::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->op == OP_MOV || i->op == OP_CALL)
         continue;

      ImmediateValue src0, src1;

      if (i->srcExists(1) &&
          i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
         expr(i, src0, src1);
      else
      if (i->srcExists(0) && i->src(0).getImmediate(src0))
         opnd(i, src0, 0);
      else
      if (i->srcExists(1) && i->src(1).getImmediate(src1))
         opnd(i, src1, 1);
   }
   return true;
}

CmpInstruction *
ConstantFolding::findOriginForTestWithZero(Value *value)
{
   if (!value)
      return NULL;
   Instruction *insn = value->getInsn();

   while (insn && insn->op != OP_SET) {
      Instruction *next = NULL;
      switch (insn->op) {
      case OP_NEG:
      case OP_ABS:
      case OP_CVT:
         next = insn->getSrc(0)->getInsn();
         if (!next || insn->sType != next->dType)
            return NULL;
         break;
      case OP_MOV:
         next = insn->getSrc(0)->getInsn();
         break;
      default:
         return NULL;
      }
      insn = next;
   }
   return insn ? insn->asCmp() : NULL;
}

void
Modifier::applyTo(ImmediateValue& imm) const
{
   switch (imm.reg.type) {
   case TYPE_F32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f32 = fabsf(imm.reg.data.f32);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f32 = -imm.reg.data.f32;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f32 < 0.0f)
            imm.reg.data.f32 = 0.0f;
         else
         if (imm.reg.data.f32 > 1.0f)
            imm.reg.data.f32 = 1.0f;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   case TYPE_S8: // NOTE: will be extended
   case TYPE_S16:
   case TYPE_S32:
   case TYPE_U8: // NOTE: treated as signed
   case TYPE_U16:
   case TYPE_U32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
            imm.reg.data.s32 : -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.s32 = -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NOT)
         imm.reg.data.s32 = ~imm.reg.data.s32;
      break;

   case TYPE_F64:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f64 = fabs(imm.reg.data.f64);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f64 = -imm.reg.data.f64;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f64 < 0.0)
            imm.reg.data.f64 = 0.0;
         else
         if (imm.reg.data.f64 > 1.0)
            imm.reg.data.f64 = 1.0;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   default:
      assert(!"invalid/unhandled type");
      imm.reg.data.u64 = 0;
      break;
   }
}

operation
Modifier::getOp() const
{
   switch (bits) {
   case NV50_IR_MOD_ABS: return OP_ABS;
   case NV50_IR_MOD_NEG: return OP_NEG;
   case NV50_IR_MOD_SAT: return OP_SAT;
   case NV50_IR_MOD_NOT: return OP_NOT;
   case 0:
      return OP_MOV;
   default:
      return OP_CVT;
   }
}

void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0, ImmediateValue &imm1)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
   struct Storage res;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_MAD:
   case OP_FMA:
   case OP_MUL:
      if (i->dnz && i->dType == TYPE_F32) {
         if (!isfinite(a->data.f32))
            a->data.f32 = 0.0f;
         if (!isfinite(b->data.f32))
            b->data.f32 = 0.0f;
      }
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 * b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 * b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_DIV:
      if (b->data.u32 == 0)
         break;
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
      case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_ADD:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_POW:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
      default:
         return;
      }
      break;
   case OP_MAX:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_MIN:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_AND:
      res.data.u64 = a->data.u64 & b->data.u64;
      break;
   case OP_OR:
      res.data.u64 = a->data.u64 | b->data.u64;
      break;
   case OP_XOR:
      res.data.u64 = a->data.u64 ^ b->data.u64;
      break;
   case OP_SHL:
      res.data.u32 = a->data.u32 << b->data.u32;
      break;
   case OP_SHR:
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SLCT:
      if (a->data.u32 != b->data.u32)
         return;
      res.data.u32 = a->data.u32;
      break;
   default:
      return;
   }
   ++foldCount;

   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);

   i->getSrc(0)->reg.data = res.data;

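   // For MAD/FMA only the multiply part was folded; rewrite as
   // ADD(src2, product) and fold again if src2 is an immediate too.
   // E.g. (illustrative): mad $r0, 2, 3, $r1  ->  add $r0, $r1, 6.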
   if (i->op == OP_MAD || i->op == OP_FMA) {
      i->op = OP_ADD;

      i->setSrc(1, i->getSrc(0));
      i->src(1).mod = i->src(2).mod;
      i->setSrc(0, i->getSrc(2));
      i->setSrc(2, NULL);

      ImmediateValue src0;
      if (i->src(0).getImmediate(src0))
         expr(i, src0, *i->getSrc(1)->asImm());
   } else {
      i->op = OP_MOV;
   }
}

void
ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
{
   Storage res;

   if (i->dType != TYPE_F32)
      return;
   switch (i->op) {
   case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
   case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
   case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
   case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
   case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
   case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
   case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
   case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
   case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
   case OP_PRESIN:
   case OP_PREEX2:
      // these should be handled in subsequent OP_SIN/COS/EX2
      res.data.f32 = imm.reg.data.f32;
      break;
   default:
      return;
   }
   i->op = OP_MOV;
   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
   i->src(0).mod = Modifier(0);
}

void
ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
                                        const int s, ImmediateValue& imm2)
{
   const int t = s ? 0 : 1;
   Instruction *insn;
   Instruction *mul1 = NULL; // mul1 before mul2
   int e = 0;
   float f = imm2.reg.data.f32;
   ImmediateValue imm1;

   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);

   if (mul2->getSrc(t)->refCount() == 1) {
      insn = mul2->getSrc(t)->getInsn();
      if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
         mul1 = insn;
      if (mul1 && !mul1->saturate) {
         int s1;

         if (mul1->src(s1 = 0).getImmediate(imm1) ||
             mul1->src(s1 = 1).getImmediate(imm1)) {
            bld.setPosition(mul1, false);
            // a = mul r, imm1
            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
            mul1->src(s1).mod = Modifier(0);
            mul2->def(0).replace(mul1->getDef(0), false);
         } else
         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
            // c = mul a, b
            // d = mul c, imm -> d = mul_x_imm a, b
            mul1->postFactor = e;
            mul2->def(0).replace(mul1->getDef(0), false);
            if (f < 0)
               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
         }
         mul1->saturate = mul2->saturate;
         return;
      }
   }
   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
      // b = mul a, imm
      // d = mul b, c -> d = mul_x_imm a, c
      int s2, t2;
      insn = mul2->getDef(0)->uses.front()->getInsn();
      if (!insn)
         return;
      mul1 = mul2;
      mul2 = NULL;
      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
      t2 = s2 ? 0 : 1;
      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
            mul2 = insn;
      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
         mul2->postFactor = e;
         mul2->setSrc(s2, mul1->src(t));
         if (f < 0)
            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
      }
   }
}

void
ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
   const int t = !s;
   const operation op = i->op;

   switch (i->op) {
   case OP_MUL:
      if (i->dType == TYPE_F32)
         tryCollapseChainedMULs(i, s, imm0);

      if (imm0.isInteger(0)) {
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->setSrc(1, NULL);
      } else
      if (imm0.isInteger(1) || imm0.isInteger(-1)) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = i->src(t).mod.getOp();
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
            i->src(1).mod = 0;
         }
         if (i->op != OP_CVT)
            i->src(0).mod = 0;
         i->setSrc(1, NULL);
      } else
      if (imm0.isInteger(2) || imm0.isInteger(-2)) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = OP_ADD;
         i->setSrc(s, i->getSrc(t));
         i->src(s).mod = i->src(t).mod;
      } else
      if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
         i->op = OP_SHL;
         imm0.applyLog2();
         i->setSrc(0, i->getSrc(t));
         i->src(0).mod = i->src(t).mod;
         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
         i->src(1).mod = 0;
      }
      break;
   case OP_ADD:
      if (imm0.isInteger(0)) {
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
         }
         i->setSrc(1, NULL);
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = Modifier(0);
      }
      break;

   case OP_DIV:
      if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
         break;
      bld.setPosition(i, false);
      if (imm0.reg.data.u32 == 0) {
         break;
      } else
      if (imm0.reg.data.u32 == 1) {
         i->op = OP_MOV;
         i->setSrc(1, NULL);
      } else
      if (i->dType == TYPE_U32 && imm0.isPow2()) {
         i->op = OP_SHR;
         i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
      } else
      if (i->dType == TYPE_U32) {
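         // Expand unsigned division by a constant into a multiply by a
         // magic number, following the round-up method of Granlund/Montgomery
         // (see also Hacker's Delight): with l = ceil(log2(d)) and
         // m = floor(2^32 * (2^l - d) / d) + 1, the quotient is
         //   hi = mulhi(n, m);  q = (hi + ((n - hi) >> 1)) >> (l - 1)
         // which is what the instruction sequence below computes.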
         Instruction *mul;
         Value *tA, *tB;
         const uint32_t d = imm0.reg.data.u32;
         uint32_t m;
         int r, s;
         uint32_t l = util_logbase2(d);
         if (((uint32_t)1 << l) < d)
            ++l;
         m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
         r = l ? 1 : 0;
         s = l ? (l - 1) : 0;

         tA = bld.getSSA();
         tB = bld.getSSA();
         mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
                         bld.loadImm(NULL, m));
         mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
         bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
         tA = bld.getSSA();
         if (r)
            bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
         else
            tA = tB;
         tB = s ? bld.getSSA() : i->getDef(0);
         bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
         if (s)
            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));

         delete_Instruction(prog, i);
      } else
      if (imm0.reg.data.s32 == -1) {
         i->op = OP_NEG;
         i->setSrc(1, NULL);
      } else {
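         // Signed counterpart of the magic-number division above: with
         // l >= 1 and m = floor(2^(31 + l) / |d|) + 1 - 2^32 (a negative
         // 32-bit magic), q = (mulhi(n, m) + n) >> (l - 1); the SET below
         // yields -1 for negative n, which the SUB turns into the required
         // +1 correction, and the result is negated when d < 0.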
         LValue *tA, *tB;
         LValue *tD;
         const int32_t d = imm0.reg.data.s32;
         int32_t m;
         int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
         if ((1 << l) < abs(d))
            ++l;
         if (!l)
            l = 1;
         m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);

         tA = bld.getSSA();
         tB = bld.getSSA();
         bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
                   i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
         if (l > 1)
            bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
         else
            tB = tA;
         tA = bld.getSSA();
         bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, i->getSrc(0), bld.mkImm(0));
         tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
         bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
         if (d < 0)
            bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tD);

         delete_Instruction(prog, i);
      }
      break;

   case OP_MOD:
      if (i->sType == TYPE_U32 && imm0.isPow2()) {
         bld.setPosition(i, false);
         i->op = OP_AND;
         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
      }
      break;

   case OP_SET: // TODO: SET_AND,OR,XOR
   {
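      // Here @i compares the boolean result of another SET against zero.
      // Since that result r is 0 or 1 (NEG/ABS/CVT were stripped by
      // findOriginForTestWithZero), the outer test folds into the inner
      // condition: r < 0 is always false, r >= 0 always true, r == 0 and
      // r <= 0 invert the original condition, r > 0 and r != 0 keep it.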
      CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
      CondCode cc, ccZ;
      if (i->src(t).mod != Modifier(0))
         return;
      if (imm0.reg.data.u32 != 0 || !si || si->op != OP_SET)
         return;
      cc = si->setCond;
      ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
      if (s == 0)
         ccZ = reverseCondCode(ccZ);
      switch (ccZ) {
      case CC_LT: cc = CC_FL; break;
      case CC_GE: cc = CC_TR; break;
      case CC_EQ: cc = inverseCondCode(cc); break;
      case CC_LE: cc = inverseCondCode(cc); break;
      case CC_GT: break;
      case CC_NE: break;
      default:
         return;
      }
      i->asCmp()->setCond = cc;
      i->setSrc(0, si->src(0));
      i->setSrc(1, si->src(1));
      i->sType = si->sType;
   }
   break;

   case OP_SHL:
   {
      if (s != 1 || i->src(0).mod != Modifier(0))
         break;
      // try to concatenate shifts
      Instruction *si = i->getSrc(0)->getInsn();
      if (!si || si->op != OP_SHL)
         break;
      ImmediateValue imm1;
      if (si->src(1).getImmediate(imm1)) {
         bld.setPosition(i, false);
         i->setSrc(0, si->getSrc(0));
         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
      }
   }
   break;

   case OP_ABS:
   case OP_NEG:
   case OP_LG2:
   case OP_RCP:
   case OP_SQRT:
   case OP_RSQ:
   case OP_PRESIN:
   case OP_SIN:
   case OP_COS:
   case OP_PREEX2:
   case OP_EX2:
      unary(i, imm0);
      break;
   default:
      return;
   }
   if (i->op != op)
      foldCount++;
}

// =============================================================================

// Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
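// E.g. (illustrative): with y = neg(x), "add $r0, y, z" becomes
// "add $r0, -x, z", folding the negation into a source modifier where the
// target supports it.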
class ModifierFolding : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

bool
ModifierFolding::visit(BasicBlock *bb)
{
   const Target *target = prog->getTarget();

   Instruction *i, *next, *mi;
   Modifier mod;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (0 && i->op == OP_SUB) {
         // turn "sub" into "add neg" (do we really want this ?)
         i->op = OP_ADD;
         i->src(1).mod = i->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
      }

      for (int s = 0; s < 3 && i->srcExists(s); ++s) {
         mi = i->getSrc(s)->getInsn();
         if (!mi ||
             mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
            continue;
         if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
            if ((i->op != OP_ADD &&
                 i->op != OP_MUL) ||
                (mi->op != OP_ABS &&
                 mi->op != OP_NEG))
               continue;
         } else
         if (i->sType != mi->dType) {
            continue;
         }
         if ((mod = Modifier(mi->op)) == Modifier(0))
            continue;
         mod *= mi->src(0).mod;

         if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
            // abs neg [abs] = abs
            mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
         } else
         if ((i->op == OP_NEG) && mod.neg()) {
            assert(s == 0);
            // neg as both opcode and modifier on same insn is prohibited
            // neg neg abs = abs, neg neg = identity
            mod = mod & Modifier(~NV50_IR_MOD_NEG);
            i->op = mod.getOp();
            mod = mod & Modifier(~NV50_IR_MOD_ABS);
            if (mod == Modifier(0))
               i->op = OP_MOV;
         }

         if (target->isModSupported(i, s, mod)) {
            i->setSrc(s, mi->getSrc(0));
            i->src(s).mod *= mod;
         }
      }

      if (i->op == OP_SAT) {
         mi = i->getSrc(0)->getInsn();
         if (mi &&
             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
            mi->saturate = 1;
            mi->setDef(0, i->getDef(0));
            delete_Instruction(prog, i);
         }
      }
   }

   return true;
}

// =============================================================================

// MUL + ADD -> MAD/FMA
// MIN/MAX(a, a) -> a, etc.
// SLCT(a, b, const) -> cc(const) ? a : b
// RCP(RCP(a)) -> a
// MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
class AlgebraicOpt : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void handleABS(Instruction *);
   bool handleADD(Instruction *);
   bool tryADDToMADOrSAD(Instruction *, operation toOp);
   void handleMINMAX(Instruction *);
   void handleRCP(Instruction *);
   void handleSLCT(Instruction *);
   void handleLOGOP(Instruction *);
   void handleCVT(Instruction *);

   BuildUtil bld;
};

void
AlgebraicOpt::handleABS(Instruction *abs)
{
   Instruction *sub = abs->getSrc(0)->getInsn();
   DataType ty;
   if (!sub ||
       !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
      return;
   // expect not to have mods yet, if we do, bail
   if (sub->src(0).mod || sub->src(1).mod)
      return;
   // hidden conversion ?
   ty = intTypeToSigned(sub->dType);
   if (abs->dType != abs->sType || ty != abs->sType)
      return;

   if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
       sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
       sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
      return;

   Value *src0 = sub->getSrc(0);
   Value *src1 = sub->getSrc(1);

   if (sub->op == OP_ADD) {
      Instruction *neg = sub->getSrc(1)->getInsn();
      if (neg && neg->op != OP_NEG) {
         neg = sub->getSrc(0)->getInsn();
         src0 = sub->getSrc(1);
      }
      if (!neg || neg->op != OP_NEG ||
          neg->dType != neg->sType || neg->sType != ty)
         return;
      src1 = neg->getSrc(0);
   }

   // found ABS(SUB)
   abs->moveSources(1, 2); // move sources >=1 up by 2
   abs->op = OP_SAD;
   abs->setType(sub->dType);
   abs->setSrc(0, src0);
   abs->setSrc(1, src1);
   bld.setPosition(abs, false);
   abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
}

bool
AlgebraicOpt::handleADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return false;

   bool changed = false;
   if (!changed && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
      changed = tryADDToMADOrSAD(add, OP_MAD);
   if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
      changed = tryADDToMADOrSAD(add, OP_SAD);
   return changed;
}

// ADD(SAD(a,b,0), c) -> SAD(a,b,c)
// ADD(MUL(a,b), c) -> MAD(a,b,c)
bool
AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   Value *src;
   int s;
   const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
   const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
   Modifier mod[4];

   if (src0->refCount() == 1 &&
       src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
      s = 0;
   else
   if (src1->refCount() == 1 &&
       src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
      s = 1;
   else
      return false;

   if ((src0->getUniqueInsn() && src0->getUniqueInsn()->bb != add->bb) ||
       (src1->getUniqueInsn() && src1->getUniqueInsn()->bb != add->bb))
      return false;

   src = add->getSrc(s);

   if (src->getInsn()->postFactor)
      return false;
   if (toOp == OP_SAD) {
      ImmediateValue imm;
      if (!src->getInsn()->src(2).getImmediate(imm))
         return false;
      if (!imm.isInteger(0))
         return false;
   }

   mod[0] = add->src(0).mod;
   mod[1] = add->src(1).mod;
   mod[2] = src->getUniqueInsn()->src(0).mod;
   mod[3] = src->getUniqueInsn()->src(1).mod;

   if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
      return false;

   add->op = toOp;
   add->subOp = src->getInsn()->subOp; // potentially mul-high

   add->setSrc(2, add->src(s ? 0 : 1));

   add->setSrc(0, src->getInsn()->getSrc(0));
   add->src(0).mod = mod[2] ^ mod[s];
   add->setSrc(1, src->getInsn()->getSrc(1));
   add->src(1).mod = mod[3];

   return true;
}

void
AlgebraicOpt::handleMINMAX(Instruction *minmax)
{
   Value *src0 = minmax->getSrc(0);
   Value *src1 = minmax->getSrc(1);

   if (src0 != src1 || src0->reg.file != FILE_GPR)
      return;
   if (minmax->src(0).mod == minmax->src(1).mod) {
      if (minmax->def(0).mayReplace(minmax->src(0))) {
         minmax->def(0).replace(minmax->src(0), false);
         minmax->bb->remove(minmax);
      } else {
         minmax->op = OP_CVT;
         minmax->setSrc(1, NULL);
      }
   } else {
      // TODO:
      // min(x, -x) = -abs(x)
      // min(x, -abs(x)) = -abs(x)
      // min(x, abs(x)) = x
      // max(x, -abs(x)) = x
      // max(x, abs(x)) = abs(x)
      // max(x, -x) = abs(x)
   }
}

void
AlgebraicOpt::handleRCP(Instruction *rcp)
{
   Instruction *si = rcp->getSrc(0)->getUniqueInsn();

   if (si && si->op == OP_RCP) {
      Modifier mod = rcp->src(0).mod * si->src(0).mod;
      rcp->op = mod.getOp();
      rcp->setSrc(0, si->getSrc(0));
   }
}

void
AlgebraicOpt::handleSLCT(Instruction *slct)
{
   if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
      if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
         slct->setSrc(0, slct->getSrc(1));
   } else
   if (slct->getSrc(0) != slct->getSrc(1)) {
      return;
   }
   slct->op = OP_MOV;
   slct->setSrc(1, NULL);
   slct->setSrc(2, NULL);
}

void
AlgebraicOpt::handleLOGOP(Instruction *logop)
{
   Value *src0 = logop->getSrc(0);
   Value *src1 = logop->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0 == src1) {
      if ((logop->op == OP_AND || logop->op == OP_OR) &&
          logop->def(0).mayReplace(logop->src(0))) {
         logop->def(0).replace(logop->src(0), false);
         delete_Instruction(prog, logop);
      }
   } else {
      // try AND(SET, SET) -> SET_AND(SET)
      Instruction *set0 = src0->getInsn();
      Instruction *set1 = src1->getInsn();

      if (!set0 || set0->fixed || !set1 || set1->fixed)
         return;
      if (set1->op != OP_SET) {
         Instruction *xchg = set0;
         set0 = set1;
         set1 = xchg;
         if (set1->op != OP_SET)
            return;
      }
      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
         return;
      if (set0->op != OP_SET &&
          set0->op != OP_SET_AND &&
          set0->op != OP_SET_OR &&
          set0->op != OP_SET_XOR)
         return;
      if (set0->getDef(0)->refCount() > 1 &&
          set1->getDef(0)->refCount() > 1)
         return;
      if (set0->getPredicate() || set1->getPredicate())
         return;
      // check that they don't source each other
      for (int s = 0; s < 2; ++s)
         if (set0->getSrc(s) == set1->getDef(0) ||
             set1->getSrc(s) == set0->getDef(0))
            return;

      set0 = cloneForward(func, set0);
      set1 = cloneShallow(func, set1);
      logop->bb->insertAfter(logop, set1);
      logop->bb->insertAfter(logop, set0);

      set0->dType = TYPE_U8;
      set0->getDef(0)->reg.file = FILE_PREDICATE;
      set0->getDef(0)->reg.size = 1;
      set1->setSrc(2, set0->getDef(0));
      set1->op = redOp;
      set1->setDef(0, logop->getDef(0));
      delete_Instruction(prog, logop);
   }
}

// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
// nv50:
// F2I(NEG(I2F(ABS(SET))))
void
AlgebraicOpt::handleCVT(Instruction *cvt)
{
   if (cvt->sType != TYPE_F32 ||
       cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
      return;
   Instruction *insn = cvt->getSrc(0)->getInsn();
   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
      return;
   if (insn->src(0).mod != Modifier(0))
      return;
   insn = insn->getSrc(0)->getInsn();

   // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
   if (insn && insn->op == OP_CVT &&
       insn->dType == TYPE_F32 &&
       insn->sType == TYPE_S32) {
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
          insn->src(0).mod)
         return;
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
         return;
   } else
   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
      return;
   }

   Instruction *bset = cloneShallow(func, insn);
   bset->dType = TYPE_U32;
   bset->setDef(0, cvt->getDef(0));
   cvt->bb->insertAfter(cvt, bset);
   delete_Instruction(prog, cvt);
}

bool
AlgebraicOpt::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;
      switch (i->op) {
      case OP_ABS:
         handleABS(i);
         break;
      case OP_ADD:
         handleADD(i);
         break;
      case OP_RCP:
         handleRCP(i);
         break;
      case OP_MIN:
      case OP_MAX:
         handleMINMAX(i);
         break;
      case OP_SLCT:
         handleSLCT(i);
         break;
      case OP_AND:
      case OP_OR:
      case OP_XOR:
         handleLOGOP(i);
         break;
      case OP_CVT:
         handleCVT(i);
         break;
      default:
         break;
      }
   }

   return true;
}

// =============================================================================

static inline void
updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
{
   if (offset != ldst->getSrc(0)->reg.data.offset) {
      if (ldst->getSrc(0)->refCount() > 1)
         ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
      ldst->getSrc(0)->reg.data.offset = offset;
   }
}

// Combine loads and stores, forward stores to loads where possible.
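// Illustrative examples (hypothetical offsets): two 32-bit loads from
// c0[0x10] and c0[0x14] combine into a single 64-bit load; a load that
// follows a store to the same location is replaced with the stored values,
// avoiding the round trip through memory.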
class MemoryOpt : public Pass
{
private:
   class Record
   {
   public:
      Record *next;
      Instruction *insn;
      const Value *rel[2];
      const Value *base;
      int32_t offset;
      int8_t fileIndex;
      uint8_t size;
      bool locked;
      Record *prev;

      bool overlaps(const Instruction *ldst) const;

      inline void link(Record **);
      inline void unlink(Record **);
      inline void set(const Instruction *ldst);
   };

public:
   MemoryOpt();

   Record *loads[DATA_FILE_COUNT];
   Record *stores[DATA_FILE_COUNT];

   MemoryPool recordPool;

private:
   virtual bool visit(BasicBlock *);
   bool runOpt(BasicBlock *);

   Record **getList(const Instruction *);

   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;

   // merge @insn into load/store instruction from @rec
   bool combineLd(Record *rec, Instruction *ld);
   bool combineSt(Record *rec, Instruction *st);

   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
   bool replaceLdFromSt(Instruction *ld, Record *stRec);
   bool replaceStFromSt(Instruction *restrict st, Record *stRec);

   void addRecord(Instruction *ldst);
   void purgeRecords(Instruction *const st, DataFile);
   void lockStores(Instruction *const ld);
   void reset();

private:
   Record *prevRecord;
};

MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
{
   for (int i = 0; i < DATA_FILE_COUNT; ++i) {
      loads[i] = NULL;
      stores[i] = NULL;
   }
   prevRecord = NULL;
}

void
MemoryOpt::reset()
{
   for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
      Record *it, *next;
      for (it = loads[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      loads[i] = NULL;
      for (it = stores[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      stores[i] = NULL;
   }
}

bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
{
   int32_t offRc = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeLd = typeSizeof(ld->dType);
   int size = sizeRc + sizeLd;
   int d, j;

   if (!prog->getTarget()->
       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned loads
   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
      return false;

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
      int sz;
      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
      // d: nr of definitions in ld
      // j: nr of definitions in rec->insn, move:
      for (d = d + j - 1; j > 0; --j, --d)
         rec->insn->setDef(d, rec->insn->getDef(j - 1));

      if (rec->insn->getSrc(0)->refCount() > 1)
         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;

      d = 0;
   } else {
      d = j;
   }
   // move definitions of @ld to @rec->insn
   for (j = 0; sizeLd; ++j, ++d) {
      sizeLd -= ld->getDef(j)->reg.size;
      rec->insn->setDef(d, ld->getDef(j));
   }

   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));

   delete_Instruction(prog, ld);

   return true;
}

bool
MemoryOpt::combineSt(Record *rec, Instruction *st)
{
   int32_t offRc = rec->offset;
   int32_t offSt = st->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeSt = typeSizeof(st->dType);
   int s = sizeSt / 4;
   int size = sizeRc + sizeSt;
   int j, k;
   Value *src[4]; // no modifiers in ValueRef allowed for st
   Value *extra[3];

   if (!prog->getTarget()->
       isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   if (size == 8 && MIN2(offRc, offSt) & 0x7)
      return false;

   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offRc < offSt) {
      // save values from @st
      for (s = 0; sizeSt; ++s) {
         sizeSt -= st->getSrc(s + 1)->reg.size;
         src[s] = st->getSrc(s + 1);
      }
      // set record's values as low sources of @st
      for (j = 1; sizeRc; ++j) {
         sizeRc -= rec->insn->getSrc(j)->reg.size;
         st->setSrc(j, rec->insn->getSrc(j));
      }
      // set saved values as high sources of @st
      for (k = j, j = 0; j < s; ++j)
         st->setSrc(k++, src[j]);

      updateLdStOffset(st, offRc, func);
   } else {
      for (j = 1; sizeSt; ++j)
         sizeSt -= st->getSrc(j)->reg.size;
      for (s = 1; sizeRc; ++j, ++s) {
         sizeRc -= rec->insn->getSrc(s)->reg.size;
         st->setSrc(j, rec->insn->getSrc(s));
      }
      rec->offset = offSt;
   }
   st->putExtraSources(0, extra); // restore pointer and predicate

   delete_Instruction(prog, rec->insn);
   rec->insn = st;
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));
   return true;
}

void
MemoryOpt::Record::set(const Instruction *ldst)
{
   const Symbol *mem = ldst->getSrc(0)->asSym();
   fileIndex = mem->reg.fileIndex;
   rel[0] = ldst->getIndirect(0, 0);
   rel[1] = ldst->getIndirect(0, 1);
   offset = mem->reg.data.offset;
   base = mem->getBase();
   size = typeSizeof(ldst->sType);
}

void
MemoryOpt::Record::link(Record **list)
{
   next = *list;
   if (next)
      next->prev = this;
   prev = NULL;
   *list = this;
}

void
MemoryOpt::Record::unlink(Record **list)
{
   if (next)
      next->prev = prev;
   if (prev)
      prev->next = next;
   else
      *list = next;
}

MemoryOpt::Record **
MemoryOpt::getList(const Instruction *insn)
{
   if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
      return &loads[insn->src(0).getFile()];
   return &stores[insn->src(0).getFile()];
}

void
MemoryOpt::addRecord(Instruction *i)
{
   Record **list = getList(i);
   Record *it = reinterpret_cast<Record *>(recordPool.allocate());

   it->link(list);
   it->set(i);
   it->insn = i;
   it->locked = false;
}

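// Find a previous access that @insn can be combined with or satisfied from.
// Only records within the same 16-byte line (offset >> 4), with matching
// file index and identical indirect sources, are considered; @isAdj is set
// when the two accesses are adjacent rather than overlapping.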
MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
{
   const Symbol *sym = insn->getSrc(0)->asSym();
   const int size = typeSizeof(insn->sType);
   Record *rec = NULL;
   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];

   for (; it; it = it->next) {
      if (it->locked && insn->op != OP_LOAD)
         continue;
      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
          it->rel[0] != insn->getIndirect(0, 0) ||
          it->fileIndex != sym->reg.fileIndex ||
          it->rel[1] != insn->getIndirect(0, 1))
         continue;

      if (it->offset < sym->reg.data.offset) {
         if (it->offset + it->size >= sym->reg.data.offset) {
            isAdj = (it->offset + it->size == sym->reg.data.offset);
            if (!isAdj)
               return it;
            if (!(it->offset & 0x7))
               rec = it;
         }
      } else {
         isAdj = it->offset != sym->reg.data.offset;
         if (size <= it->size && !isAdj)
            return it;
         else
         if (!(sym->reg.data.offset & 0x7))
            if (it->offset - size <= sym->reg.data.offset)
               rec = it;
      }
   }
   return rec;
}

bool
MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
{
   Instruction *st = rec->insn;
   int32_t offSt = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int d, s;

   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
      offSt += st->getSrc(s)->reg.size;
   if (offSt != offLd)
      return false;

   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
         return false;
      if (st->getSrc(s)->reg.file != FILE_GPR)
         return false;
      ld->def(d).replace(st->src(s), false);
   }
   ld->bb->remove(ld);
   return true;
}

bool
MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
{
   Instruction *ldR = rec->insn;
   int32_t offR = rec->offset;
   int32_t offE = ldE->getSrc(0)->reg.data.offset;
   int dR, dE;

   assert(offR <= offE);
   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
      offR += ldR->getDef(dR)->reg.size;
   if (offR != offE)
      return false;

   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
         return false;
      ldE->def(dE).replace(ldR->getDef(dR), false);
   }

   delete_Instruction(prog, ldE);
   return true;
}

bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
{
   const Instruction *const ri = rec->insn;
   Value *extra[3];

   int32_t offS = st->getSrc(0)->reg.data.offset;
   int32_t offR = rec->offset;
   int32_t endS = offS + typeSizeof(st->dType);
   int32_t endR = offR + typeSizeof(ri->dType);

   rec->size = MAX2(endS, endR) - MIN2(offS, offR);

   st->takeExtraSources(0, extra);

   if (offR < offS) {
      Value *vals[10];
      int s, n;
      int k = 0;
      // get non-replaced sources of ri
      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      n = s;
      // get replaced sources of st
      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
         vals[k++] = st->getSrc(s);
      // skip replaced sources of ri
      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
      // get non-replaced sources after values covered by st
      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      assert((unsigned int)k <= Elements(vals));
      for (s = 0; s < k; ++s)
         st->setSrc(s + 1, vals[s]);
      st->setSrc(0, ri->getSrc(0));
   } else
   if (endR > endS) {
      int j, s;
      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
         st->setSrc(s++, ri->getSrc(j));
   }
   st->putExtraSources(0, extra);

   delete_Instruction(prog, rec->insn);

   rec->insn = st;
   rec->offset = st->getSrc(0)->reg.data.offset;

   st->setType(typeOfSize(rec->size));

   return true;
}

bool
MemoryOpt::Record::overlaps(const Instruction *ldst) const
{
   Record that;
   that.set(ldst);

   if (this->fileIndex != that.fileIndex)
      return false;

   if (this->rel[0] || that.rel[0])
      return this->base == that.base;
   return
      (this->offset < that.offset + that.size) &&
      (this->offset + this->size > that.offset);
}

// We must not eliminate stores that affect the result of @ld if
// we find later stores to the same location, and we may no longer
// merge them with later stores.
// The stored value can, however, still be used to determine the value
// returned by future loads.
void
MemoryOpt::lockStores(Instruction *const ld)
{
   for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
      if (!r->locked && r->overlaps(ld))
         r->locked = true;
}

// Prior loads from the location of @st are no longer valid.
// Stores to the location of @st may no longer be used to derive
// the value at it nor be coalesced into later stores.
void
MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
{
   if (st)
      f = st->src(0).getFile();

   for (Record *r = loads[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&loads[f]);

   for (Record *r = stores[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&stores[f]);
}

bool
MemoryOpt::visit(BasicBlock *bb)
{
   bool ret = runOpt(bb);
   // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
   // where 96 bit memory operations are forbidden.
   if (ret)
      ret = runOpt(bb);
   return ret;
}

bool
MemoryOpt::runOpt(BasicBlock *bb)
{
   Instruction *ldst, *next;
   Record *rec;
   bool isAdjacent = true;

   for (ldst = bb->getEntry(); ldst; ldst = next) {
      bool keep = true;
      bool isLoad = true;
      next = ldst->next;

      if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
         if (ldst->isDead()) {
            // might have been produced by earlier optimization
            delete_Instruction(prog, ldst);
            continue;
         }
      } else
      if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
         isLoad = false;
      } else {
         // TODO: maybe have all fixed ops act as barrier ?
         if (ldst->op == OP_CALL) {
            purgeRecords(NULL, FILE_MEMORY_LOCAL);
            purgeRecords(NULL, FILE_MEMORY_GLOBAL);
            purgeRecords(NULL, FILE_MEMORY_SHARED);
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         } else
         if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         }
         continue;
      }
      if (ldst->getPredicate()) // TODO: handle predicated ld/st
         continue;

      if (isLoad) {
         DataFile file = ldst->src(0).getFile();

         // if ld l[]/g[] look for previous store to eliminate the reload
         if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
            // TODO: shared memory ?
            rec = findRecord(ldst, false, isAdjacent);
            if (rec && !isAdjacent)
               keep = !replaceLdFromSt(ldst, rec);
         }

         // or look for ld from the same location and replace this one
         rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
         if (rec) {
            if (!isAdjacent)
               keep = !replaceLdFromLd(ldst, rec);
            else
               // or combine a previous load with this one
               keep = !combineLd(rec, ldst);
         }
         if (keep)
            lockStores(ldst);
      } else {
         rec = findRecord(ldst, false, isAdjacent);
         if (rec) {
            if (!isAdjacent)
               keep = !replaceStFromSt(ldst, rec);
            else
               keep = !combineSt(rec, ldst);
         }
         if (keep)
            purgeRecords(ldst, DATA_FILE_COUNT);
      }
      if (keep)
         addRecord(ldst);
   }
   reset();

   return true;
}

// =============================================================================

// Turn control flow into predicated instructions (after register allocation !).
// TODO:
// Could move this to before register allocation on NVC0 and also handle nested
// constructs.
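// Illustrative example: a small diamond
//   if (p) { a; b; } else { c; }
// becomes the predicated straight-line sequence
//   a (p); b (p); c (!p)
// with the branch and join instructions removed.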
class FlatteningPass : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   bool tryPredicateConditional(BasicBlock *);
   void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
   void tryPropagateBranch(BasicBlock *);
   inline bool isConstantCondition(Value *pred);
   inline bool mayPredicate(const Instruction *, const Value *pred) const;
   inline void removeFlow(Instruction *);
};

bool
FlatteningPass::isConstantCondition(Value *pred)
{
   Instruction *insn = pred->getUniqueInsn();
   assert(insn);
   if (insn->op != OP_SET || insn->srcExists(2))
      return false;

   for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
      Instruction *ld = insn->getSrc(s)->getUniqueInsn();
      DataFile file;
      if (ld) {
         if (ld->op != OP_MOV && ld->op != OP_LOAD)
            return false;
         if (ld->src(0).isIndirect(0))
            return false;
         file = ld->src(0).getFile();
      } else {
         file = insn->src(s).getFile();
         // catch $r63 on NVC0
         if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR)
            file = FILE_IMMEDIATE;
      }
      if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
         return false;
   }
   return true;
}

void
FlatteningPass::removeFlow(Instruction *insn)
{
   FlowInstruction *term = insn ? insn->asFlow() : NULL;
   if (!term)
      return;
   Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();

   if (term->op == OP_BRA) {
      // TODO: this might get more difficult when we get arbitrary BRAs
      if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
         return;
   } else
   if (term->op != OP_JOIN)
      return;

   Value *pred = term->getPredicate();

   delete_Instruction(prog, term);

   if (pred && pred->refCount() == 0) {
      Instruction *pSet = pred->getUniqueInsn();
      pred->join->reg.data.id = -1; // deallocate
      if (pSet->isDead())
         delete_Instruction(prog, pSet);
   }
}

void
FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
{
   for (Instruction *i = bb->getEntry(); i; i = i->next) {
      if (i->isNop())
         continue;
      assert(!i->getPredicate());
      i->setPredicate(cc, pred);
   }
   removeFlow(bb->getExit());
}

bool
FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
{
   if (insn->isPseudo())
      return true;
   // TODO: calls where we don't know which registers are modified

   if (!prog->getTarget()->mayPredicate(insn, pred))
      return false;
   for (int d = 0; insn->defExists(d); ++d)
      if (insn->getDef(d)->equals(pred))
         return false;
   return true;
}

// If we conditionally skip over or to a branch instruction, replace it.
// NOTE: We do not update the CFG anymore here !
void
FlatteningPass::tryPropagateBranch(BasicBlock *bb)
{
   BasicBlock *bf = NULL;
   unsigned int i;

   if (bb->cfg.outgoingCount() != 2)
      return;
   if (!bb->getExit() || bb->getExit()->op != OP_BRA)
      return;
   Graph::EdgeIterator ei = bb->cfg.outgoing();

   for (i = 0; !ei.end(); ++i, ei.next()) {
      bf = BasicBlock::get(ei.getNode());
      if (bf->getInsnCount() == 1)
         break;
   }
   if (ei.end() || !bf->getExit())
      return;
   FlowInstruction *bra = bb->getExit()->asFlow();
   FlowInstruction *rep = bf->getExit()->asFlow();

   if (rep->getPredicate())
      return;
   if (rep->op != OP_BRA &&
       rep->op != OP_JOIN &&
       rep->op != OP_EXIT)
      return;

   bra->op = rep->op;
   bra->target.bb = rep->target.bb;
   if (i) // 2nd out block means branch not taken
      bra->cc = inverseCondCode(bra->cc);
   bf->remove(rep);
}
1925
1926 bool
visit(BasicBlock * bb)1927 FlatteningPass::visit(BasicBlock *bb)
1928 {
1929 if (tryPredicateConditional(bb))
1930 return true;
1931
1932 // try to attach join to previous instruction
1933 Instruction *insn = bb->getExit();
1934 if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
1935 insn = insn->prev;
1936 if (insn && !insn->getPredicate() &&
1937 !insn->asFlow() &&
1938 insn->op != OP_TEXBAR &&
1939 !isTextureOp(insn->op) && // probably just nve4
1940 insn->op != OP_LINTERP && // probably just nve4
1941 insn->op != OP_PINTERP && // probably just nve4
1942 ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
1943 typeSizeof(insn->dType) <= 4) &&
1944 !insn->isNop()) {
1945 insn->join = 1;
1946 bb->remove(bb->getExit());
1947 return true;
1948 }
1949 }
1950
1951 tryPropagateBranch(bb);
1952
1953 return true;
1954 }
1955
bool
FlatteningPass::tryPredicateConditional(BasicBlock *bb)
{
   BasicBlock *bL = NULL, *bR = NULL;
   unsigned int nL = 0, nR = 0, limit = 12;
   Instruction *insn;
   unsigned int mask;

   mask = bb->initiatesSimpleConditional();
   if (!mask)
      return false;

   assert(bb->getExit());
   Value *pred = bb->getExit()->getPredicate();
   assert(pred);

   if (isConstantCondition(pred))
      limit = 4;

   Graph::EdgeIterator ei = bb->cfg.outgoing();

   if (mask & 1) {
      bL = BasicBlock::get(ei.getNode());
      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
         if (!mayPredicate(insn, pred))
            return false;
      if (nL > limit)
         return false; // too long, do a real branch
   }
   ei.next();

   if (mask & 2) {
      bR = BasicBlock::get(ei.getNode());
      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
         if (!mayPredicate(insn, pred))
            return false;
      if (nR > limit)
         return false; // too long, do a real branch
   }

   if (bL)
      predicateInstructions(bL, pred, bb->getExit()->cc);
   if (bR)
      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));

   if (bb->joinAt) {
      bb->remove(bb->joinAt);
      bb->joinAt = NULL;
   }
   removeFlow(bb->getExit()); // delete the branch/join at the fork point

   // remove potential join operations at the end of the conditional
   if (prog->getTarget()->joinAnterior) {
      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
         removeFlow(bb->getEntry());
   }

   return true;
}

// =============================================================================

// Common subexpression elimination: a naive O(n^2) implementation.
class LocalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   inline bool tryReplace(Instruction **, Instruction *);

   DLList ops[OP_LAST + 1];
};

class GlobalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

bool
Instruction::isActionEqual(const Instruction *that) const
{
   if (this->op != that->op ||
       this->dType != that->dType ||
       this->sType != that->sType)
      return false;
   if (this->cc != that->cc)
      return false;

   if (this->asTex()) {
      if (memcmp(&this->asTex()->tex,
                 &that->asTex()->tex,
                 sizeof(this->asTex()->tex)))
         return false;
   } else
   if (this->asCmp()) {
      if (this->asCmp()->setCond != that->asCmp()->setCond)
         return false;
   } else
   if (this->asFlow()) {
      return false;
   } else {
      if (this->atomic != that->atomic ||
          this->ipa != that->ipa ||
          this->lanes != that->lanes ||
          this->perPatch != that->perPatch)
         return false;
      if (this->postFactor != that->postFactor)
         return false;
   }

   if (this->subOp != that->subOp ||
       this->saturate != that->saturate ||
       this->rnd != that->rnd ||
       this->ftz != that->ftz ||
       this->dnz != that->dnz ||
       this->cache != that->cache)
      return false;

   return true;
}

bool
Instruction::isResultEqual(const Instruction *that) const
{
   unsigned int d, s;

   // NOTE: location of discard only affects tex with liveOnly and quadops
   if (!this->defExists(0) && this->op != OP_DISCARD)
      return false;

   if (!isActionEqual(that))
      return false;

   if (this->predSrc != that->predSrc)
      return false;

   for (d = 0; this->defExists(d); ++d) {
      if (!that->defExists(d) ||
          !this->getDef(d)->equals(that->getDef(d), false))
         return false;
   }
   if (that->defExists(d))
      return false;

   for (s = 0; this->srcExists(s); ++s) {
      if (!that->srcExists(s))
         return false;
      if (this->src(s).mod != that->src(s).mod)
         return false;
      if (!this->getSrc(s)->equals(that->getSrc(s), true))
         return false;
   }
   if (that->srcExists(s))
      return false;

   if (op == OP_LOAD || op == OP_VFETCH) {
      // only treat loads as equal if they access memory that cannot have
      // been written to in between (constants, shader inputs)
      switch (src(0).getFile()) {
      case FILE_MEMORY_CONST:
      case FILE_SHADER_INPUT:
         return true;
      default:
         return false;
      }
   }

   return true;
}

// pull through common expressions from different in-blocks
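//
// Illustrative sketch (added commentary, not from the original source): if
// all of a phi's sources are computed by equivalent instructions in the
// predecessor blocks,
//    BB1: add $r1 a b        BB2: add $r2 a b
//    BB3: phi $r3 $r1 $r2
// the first add is hoisted into BB3 to define $r3 directly and the phi is
// deleted; the now dead duplicates are removed by later DCE.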
bool
GlobalCSE::visit(BasicBlock *bb)
{
   Instruction *phi, *next, *ik;
   int s;

   // TODO: maybe do this with OP_UNION, too

   for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
      next = phi->next;
      if (phi->getSrc(0)->refCount() > 1)
         continue;
      ik = phi->getSrc(0)->getInsn();
      if (!ik)
         continue; // probably a function input
      for (s = 1; phi->srcExists(s); ++s) {
         if (phi->getSrc(s)->refCount() > 1)
            break;
         if (!phi->getSrc(s)->getInsn() ||
             !phi->getSrc(s)->getInsn()->isResultEqual(ik))
            break;
      }
      if (!phi->srcExists(s)) {
         Instruction *entry = bb->getEntry();
         ik->bb->remove(ik);
         if (!entry || entry->op != OP_JOIN)
            bb->insertHead(ik);
         else
            bb->insertAfter(entry, ik);
         ik->setDef(0, phi->getDef(0));
         delete_Instruction(prog, phi);
      }
   }

   return true;
}

bool
LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
{
   Instruction *old = *ptr;

   // TODO: maybe relax this later (causes trouble with OP_UNION)
   if (i->isPredicated())
      return false;

   if (!old->isResultEqual(i))
      return false;

   for (int d = 0; old->defExists(d); ++d)
      old->def(d).replace(i->getDef(d), false);
   delete_Instruction(prog, old);
   *ptr = NULL;
   return true;
}

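// For each instruction, look for an earlier equivalent one in the same block.
// To keep the quadratic search cheap, candidates are taken from the use list
// of the instruction's least-referenced LValue source; instructions without
// an LValue source are matched against the per-opcode lists in ops[] instead.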
bool
LocalCSE::visit(BasicBlock *bb)
{
   unsigned int replaced;

   do {
      Instruction *ir, *next;

      replaced = 0;

      // will need to know the order of instructions
      int serial = 0;
      for (ir = bb->getFirst(); ir; ir = ir->next)
         ir->serial = serial++;

      for (ir = bb->getEntry(); ir; ir = next) {
         int s;
         Value *src = NULL;

         next = ir->next;

         if (ir->fixed) {
            ops[ir->op].insert(ir);
            continue;
         }

         for (s = 0; ir->srcExists(s); ++s)
            if (ir->getSrc(s)->asLValue())
               if (!src || ir->getSrc(s)->refCount() < src->refCount())
                  src = ir->getSrc(s);

         if (src) {
            for (Value::UseIterator it = src->uses.begin();
                 it != src->uses.end(); ++it) {
               Instruction *ik = (*it)->getInsn();
               if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
                  if (tryReplace(&ir, ik))
                     break;
            }
         } else {
            DLLIST_FOR_EACH(&ops[ir->op], iter)
            {
               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
               if (tryReplace(&ir, ik))
                  break;
            }
         }

         if (ir)
            ops[ir->op].insert(ir);
         else
            ++replaced;
      }
      for (unsigned int i = 0; i <= OP_LAST; ++i)
         ops[i].clear();

   } while (replaced);

   return true;
}

// =============================================================================

// Remove computations of unused values.
class DeadCodeElim : public Pass
{
public:
   bool buryAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void checkSplitLoad(Instruction *ld); // for partially dead loads

   unsigned int deadCount;
};

bool
DeadCodeElim::buryAll(Program *prog)
{
   do {
      deadCount = 0;
      if (!this->run(prog, false, false))
         return false;
   } while (deadCount);

   return true;
}

bool
DeadCodeElim::visit(BasicBlock *bb)
{
   Instruction *next;

   for (Instruction *i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->isDead()) {
         ++deadCount;
         delete_Instruction(prog, i);
      } else
      if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) {
         checkSplitLoad(i);
      }
   }
   return true;
}

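// Shrink a load some of whose destination values are dead into at most two
// smaller loads covering only the live components.
//
// Illustrative sketch (added commentary, not from the original source): for
//    ld v4 u32 { $r0, $r1, $r2, $r3 } c0[0x00]
// with $r1 and $r2 unused, the load is rewritten as
//    ld u32 $r0 c0[0x00]
//    ld u32 $r3 c0[0x0c]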
void
DeadCodeElim::checkSplitLoad(Instruction *ld1)
{
   Instruction *ld2 = NULL; // can get at most 2 loads
   Value *def1[4];
   Value *def2[4];
   int32_t addr1, addr2;
   int32_t size1, size2;
   int d, n1, n2;
   uint32_t mask = 0xffffffff;

   for (d = 0; ld1->defExists(d); ++d)
      if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
         mask &= ~(1 << d);
   if (mask == 0xffffffff)
      return;

   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         if (size1 && (addr1 & 0x7))
            break;
         def1[n1] = ld1->getDef(d);
         size1 += def1[n1++]->reg.size;
      } else
      if (!n1) {
         addr1 += ld1->getDef(d)->reg.size;
      } else {
         break;
      }
   }
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
      } else {
         assert(!n2);
         addr2 += ld1->getDef(d)->reg.size;
      }
   }

   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
      ld1->setDef(d, (d < n1) ? def1[d] : NULL);

   if (!n2)
      return;

   ld2 = cloneShallow(func, ld1);
   updateLdStOffset(ld2, addr2, func);
   ld2->setType(typeOfSize(size2));
   for (d = 0; d < 4; ++d)
      ld2->setDef(d, (d < n2) ? def2[d] : NULL);

   ld1->bb->insertAfter(ld1, ld2);
}

// =============================================================================

#define RUN_PASS(l, n, f)                    \
   if (level >= (l)) {                       \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE)  \
         INFO("PEEPHOLE: %s\n", #n);         \
      n pass;                                \
      if (!pass.f(this))                     \
         return false;                       \
   }

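// A pass guarded by a higher first argument to RUN_PASS only runs when the
// requested optimization level is at least that high; the final dead code
// elimination runs even at level 0.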
bool
Program::optimizeSSA(int level)
{
   RUN_PASS(1, DeadCodeElim, buryAll);
   RUN_PASS(1, CopyPropagation, run);
   RUN_PASS(2, GlobalCSE, run);
   RUN_PASS(1, LocalCSE, run);
   RUN_PASS(2, AlgebraicOpt, run);
   RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
   RUN_PASS(1, ConstantFolding, foldAll);
   RUN_PASS(1, LoadPropagation, run);
   RUN_PASS(2, MemoryOpt, run);
   RUN_PASS(2, LocalCSE, run);
   RUN_PASS(0, DeadCodeElim, buryAll);

   return true;
}

bool
Program::optimizePostRA(int level)
{
   RUN_PASS(2, FlatteningPass, run);
   return true;
}

} // namespace nv50_ir