1 /*
2  * Copyright 2011 Christoph Bumiller
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19  * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20  * SOFTWARE.
21  */
22 
23 #include "nv50_ir.h"
24 #include "nv50_ir_target.h"
25 #include "nv50_ir_build_util.h"
26 
27 extern "C" {
28 #include "util/u_math.h"
29 }
30 
31 namespace nv50_ir {
32 
33 bool
34 Instruction::isNop() const
35 {
36    if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
37       return true;
38    if (terminator || join) // XXX: should terminator imply flow ?
39       return false;
40    if (!fixed && op == OP_NOP)
41       return true;
42 
43    if (defExists(0) && def(0).rep()->reg.data.id < 0) {
44       for (int d = 1; defExists(d); ++d)
45          if (def(d).rep()->reg.data.id >= 0)
46             WARN("part of vector result is unused !\n");
47       return true;
48    }
49 
50    if (op == OP_MOV || op == OP_UNION) {
51       if (!getDef(0)->equals(getSrc(0)))
52          return false;
53       if (op == OP_UNION)
54          if (!def(0).rep()->equals(getSrc(1)))
55             return false;
56       return true;
57    }
58 
59    return false;
60 }
61 
62 bool Instruction::isDead() const
63 {
64    if (op == OP_STORE ||
65        op == OP_EXPORT ||
66        op == OP_WRSV)
67       return false;
68 
69    for (int d = 0; defExists(d); ++d)
70       if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
71          return false;
72 
73    if (terminator || asFlow())
74       return false;
75    if (fixed)
76       return false;
77 
78    return true;
79 };
80 
81 // =============================================================================
82 
83 class CopyPropagation : public Pass
84 {
85 private:
86    virtual bool visit(BasicBlock *);
87 };
88 
89 // Propagate all MOVs forward to make subsequent optimization easier, except if
90 // the sources stem from a phi, in which case we don't want to mess up potential
91 // swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
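// Illustration (pseudo-IR, not actual pass output):
//   mov %b, %a
//   add %c, %b, %d   ->   add %c, %a, %d
// after which %b has no users left and the mov is deleted.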
92 bool
93 CopyPropagation::visit(BasicBlock *bb)
94 {
95    Instruction *mov, *si, *next;
96 
97    for (mov = bb->getEntry(); mov; mov = next) {
98       next = mov->next;
99       if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
100          continue;
101       if (mov->getPredicate())
102          continue;
103       if (mov->def(0).getFile() != mov->src(0).getFile())
104          continue;
105       si = mov->getSrc(0)->getInsn();
106       if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
107          // propagate
108          mov->def(0).replace(mov->getSrc(0), false);
109          delete_Instruction(prog, mov);
110       }
111    }
112    return true;
113 }
114 
115 // =============================================================================
116 
117 class LoadPropagation : public Pass
118 {
119 private:
120    virtual bool visit(BasicBlock *);
121 
122    void checkSwapSrc01(Instruction *);
123 
124    bool isCSpaceLoad(Instruction *);
125    bool isImmd32Load(Instruction *);
126    bool isAttribOrSharedLoad(Instruction *);
127 };
128 
129 bool
130 LoadPropagation::isCSpaceLoad(Instruction *ld)
131 {
132    return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
133 }
134 
135 bool
136 LoadPropagation::isImmd32Load(Instruction *ld)
137 {
138    if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4))
139       return false;
140    return ld->src(0).getFile() == FILE_IMMEDIATE;
141 }
142 
143 bool
144 LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
145 {
146    return ld &&
147       (ld->op == OP_VFETCH ||
148        (ld->op == OP_LOAD &&
149         (ld->src(0).getFile() == FILE_SHADER_INPUT ||
150          ld->src(0).getFile() == FILE_MEMORY_SHARED)));
151 }
152 
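// Heuristic, not a correctness requirement: for commutative ops (and for
// SET/SLCT, whose condition code is adjusted below), try to move c[] space and
// immediate operands into src1 and attribute/shared-memory loads into src0,
// the positions where insnCanLoad() is most likely to allow folding them
// directly into the instruction.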
153 void
154 LoadPropagation::checkSwapSrc01(Instruction *insn)
155 {
156    if (!prog->getTarget()->getOpInfo(insn).commutative)
157       if (insn->op != OP_SET && insn->op != OP_SLCT)
158          return;
159    if (insn->src(1).getFile() != FILE_GPR)
160       return;
161 
162    Instruction *i0 = insn->getSrc(0)->getInsn();
163    Instruction *i1 = insn->getSrc(1)->getInsn();
164 
165    if (isCSpaceLoad(i0)) {
166       if (!isCSpaceLoad(i1))
167          insn->swapSources(0, 1);
168       else
169          return;
170    } else
171    if (isImmd32Load(i0)) {
172       if (!isCSpaceLoad(i1) && !isImmd32Load(i1))
173          insn->swapSources(0, 1);
174       else
175          return;
176    } else
177    if (isAttribOrSharedLoad(i1)) {
178       if (!isAttribOrSharedLoad(i0))
179          insn->swapSources(0, 1);
180       else
181          return;
182    } else {
183       return;
184    }
185 
186    if (insn->op == OP_SET)
187       insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
188    else
189    if (insn->op == OP_SLCT)
190       insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
191 }
192 
193 bool
194 LoadPropagation::visit(BasicBlock *bb)
195 {
196    const Target *targ = prog->getTarget();
197    Instruction *next;
198 
199    for (Instruction *i = bb->getEntry(); i; i = next) {
200       next = i->next;
201 
202       if (i->srcExists(1))
203          checkSwapSrc01(i);
204 
205       for (int s = 0; i->srcExists(s); ++s) {
206          Instruction *ld = i->getSrc(s)->getInsn();
207 
208          if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
209             continue;
210          if (!targ->insnCanLoad(i, s, ld))
211             continue;
212 
213          // propagate !
214          i->setSrc(s, ld->getSrc(0));
215          if (ld->src(0).isIndirect(0))
216             i->setIndirect(s, 0, ld->getIndirect(0, 0));
217 
218          if (ld->getDef(0)->refCount() == 0)
219             delete_Instruction(prog, ld);
220       }
221    }
222    return true;
223 }
224 
225 // =============================================================================
226 
227 // Evaluate constant expressions.
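// When both sources are immediates, expr() folds the whole instruction,
// e.g. (illustrative) add $r0, 2, 3 -> mov $r0, 5; with only one immediate
// source, opnd() applies strength reductions and algebraic identities instead.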
228 class ConstantFolding : public Pass
229 {
230 public:
231    bool foldAll(Program *);
232 
233 private:
234    virtual bool visit(BasicBlock *);
235 
236    void expr(Instruction *, ImmediateValue&, ImmediateValue&);
237    void opnd(Instruction *, ImmediateValue&, int s);
238 
239    void unary(Instruction *, const ImmediateValue&);
240 
241    void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
242 
243    // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET
244    CmpInstruction *findOriginForTestWithZero(Value *);
245 
246    unsigned int foldCount;
247 
248    BuildUtil bld;
249 };
250 
251 // TODO: remember generated immediates and only revisit these
252 bool
253 ConstantFolding::foldAll(Program *prog)
254 {
255    unsigned int iterCount = 0;
256    do {
257       foldCount = 0;
258       if (!run(prog))
259          return false;
260    } while (foldCount && ++iterCount < 2);
261    return true;
262 }
263 
264 bool
265 ConstantFolding::visit(BasicBlock *bb)
266 {
267    Instruction *i, *next;
268 
269    for (i = bb->getEntry(); i; i = next) {
270       next = i->next;
271       if (i->op == OP_MOV || i->op == OP_CALL)
272          continue;
273 
274       ImmediateValue src0, src1;
275 
276       if (i->srcExists(1) &&
277           i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
278          expr(i, src0, src1);
279       else
280       if (i->srcExists(0) && i->src(0).getImmediate(src0))
281          opnd(i, src0, 0);
282       else
283       if (i->srcExists(1) && i->src(1).getImmediate(src1))
284          opnd(i, src1, 1);
285    }
286    return true;
287 }
288 
289 CmpInstruction *
290 ConstantFolding::findOriginForTestWithZero(Value *value)
291 {
292    if (!value)
293       return NULL;
294    Instruction *insn = value->getInsn();
295 
296    while (insn && insn->op != OP_SET) {
297       Instruction *next = NULL;
298       switch (insn->op) {
299       case OP_NEG:
300       case OP_ABS:
301       case OP_CVT:
302          next = insn->getSrc(0)->getInsn();
303          if (!next || insn->sType != next->dType)
304             return NULL;
305          break;
306       case OP_MOV:
307          next = insn->getSrc(0)->getInsn();
308          break;
309       default:
310          return NULL;
311       }
312       insn = next;
313    }
314    return insn ? insn->asCmp() : NULL;
315 }
316 
317 void
318 Modifier::applyTo(ImmediateValue& imm) const
319 {
320    switch (imm.reg.type) {
321    case TYPE_F32:
322       if (bits & NV50_IR_MOD_ABS)
323          imm.reg.data.f32 = fabsf(imm.reg.data.f32);
324       if (bits & NV50_IR_MOD_NEG)
325          imm.reg.data.f32 = -imm.reg.data.f32;
326       if (bits & NV50_IR_MOD_SAT) {
327          if (imm.reg.data.f32 < 0.0f)
328             imm.reg.data.f32 = 0.0f;
329          else
330          if (imm.reg.data.f32 > 1.0f)
331             imm.reg.data.f32 = 1.0f;
332       }
333       assert(!(bits & NV50_IR_MOD_NOT));
334       break;
335 
336    case TYPE_S8: // NOTE: will be extended
337    case TYPE_S16:
338    case TYPE_S32:
339    case TYPE_U8: // NOTE: treated as signed
340    case TYPE_U16:
341    case TYPE_U32:
342       if (bits & NV50_IR_MOD_ABS)
343          imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
344             imm.reg.data.s32 : -imm.reg.data.s32;
345       if (bits & NV50_IR_MOD_NEG)
346          imm.reg.data.s32 = -imm.reg.data.s32;
347       if (bits & NV50_IR_MOD_NOT)
348          imm.reg.data.s32 = ~imm.reg.data.s32;
349       break;
350 
351    case TYPE_F64:
352       if (bits & NV50_IR_MOD_ABS)
353          imm.reg.data.f64 = fabs(imm.reg.data.f64);
354       if (bits & NV50_IR_MOD_NEG)
355          imm.reg.data.f64 = -imm.reg.data.f64;
356       if (bits & NV50_IR_MOD_SAT) {
357          if (imm.reg.data.f64 < 0.0)
358             imm.reg.data.f64 = 0.0;
359          else
360          if (imm.reg.data.f64 > 1.0)
361             imm.reg.data.f64 = 1.0;
362       }
363       assert(!(bits & NV50_IR_MOD_NOT));
364       break;
365 
366    default:
367       assert(!"invalid/unhandled type");
368       imm.reg.data.u64 = 0;
369       break;
370    }
371 }
372 
373 operation
374 Modifier::getOp() const
375 {
376    switch (bits) {
377    case NV50_IR_MOD_ABS: return OP_ABS;
378    case NV50_IR_MOD_NEG: return OP_NEG;
379    case NV50_IR_MOD_SAT: return OP_SAT;
380    case NV50_IR_MOD_NOT: return OP_NOT;
381    case 0:
382       return OP_MOV;
383    default:
384       return OP_CVT;
385    }
386 }
387 
388 void
389 ConstantFolding::expr(Instruction *i,
390                       ImmediateValue &imm0, ImmediateValue &imm1)
391 {
392    struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
393    struct Storage res;
394 
395    memset(&res.data, 0, sizeof(res.data));
396 
397    switch (i->op) {
398    case OP_MAD:
399    case OP_FMA:
400    case OP_MUL:
401       if (i->dnz && i->dType == TYPE_F32) {
402          if (!isfinite(a->data.f32))
403             a->data.f32 = 0.0f;
404          if (!isfinite(b->data.f32))
405             b->data.f32 = 0.0f;
406       }
407       switch (i->dType) {
408       case TYPE_F32: res.data.f32 = a->data.f32 * b->data.f32; break;
409       case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
410       case TYPE_S32:
411       case TYPE_U32: res.data.u32 = a->data.u32 * b->data.u32; break;
412       default:
413          return;
414       }
415       break;
416    case OP_DIV:
417       if (b->data.u32 == 0)
418          break;
419       switch (i->dType) {
420       case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
421       case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
422       case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
423       case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
424       default:
425          return;
426       }
427       break;
428    case OP_ADD:
429       switch (i->dType) {
430       case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
431       case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
432       case TYPE_S32:
433       case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
434       default:
435          return;
436       }
437       break;
438    case OP_POW:
439       switch (i->dType) {
440       case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
441       case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
442       default:
443          return;
444       }
445       break;
446    case OP_MAX:
447       switch (i->dType) {
448       case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
449       case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
450       case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
451       case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
452       default:
453          return;
454       }
455       break;
456    case OP_MIN:
457       switch (i->dType) {
458       case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
459       case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
460       case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
461       case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
462       default:
463          return;
464       }
465       break;
466    case OP_AND:
467       res.data.u64 = a->data.u64 & b->data.u64;
468       break;
469    case OP_OR:
470       res.data.u64 = a->data.u64 | b->data.u64;
471       break;
472    case OP_XOR:
473       res.data.u64 = a->data.u64 ^ b->data.u64;
474       break;
475    case OP_SHL:
476       res.data.u32 = a->data.u32 << b->data.u32;
477       break;
478    case OP_SHR:
479       switch (i->dType) {
480       case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
481       case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
482       default:
483          return;
484       }
485       break;
486    case OP_SLCT:
487       if (a->data.u32 != b->data.u32)
488          return;
489       res.data.u32 = a->data.u32;
490       break;
491    default:
492       return;
493    }
494    ++foldCount;
495 
496    i->src(0).mod = Modifier(0);
497    i->src(1).mod = Modifier(0);
498 
499    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
500    i->setSrc(1, NULL);
501 
502    i->getSrc(0)->reg.data = res.data;
503 
504    if (i->op == OP_MAD || i->op == OP_FMA) {
505       i->op = OP_ADD;
506 
507       i->setSrc(1, i->getSrc(0));
508       i->src(1).mod = i->src(2).mod;
509       i->setSrc(0, i->getSrc(2));
510       i->setSrc(2, NULL);
511 
512       ImmediateValue src0;
513       if (i->src(0).getImmediate(src0))
514          expr(i, src0, *i->getSrc(1)->asImm());
515    } else {
516       i->op = OP_MOV;
517    }
518 }
519 
520 void
521 ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
522 {
523    Storage res;
524 
525    if (i->dType != TYPE_F32)
526       return;
527    switch (i->op) {
528    case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
529    case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
530    case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
531    case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
532    case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
533    case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
534    case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
535    case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
536    case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
537    case OP_PRESIN:
538    case OP_PREEX2:
539       // these should be handled in subsequent OP_SIN/COS/EX2
540       res.data.f32 = imm.reg.data.f32;
541       break;
542    default:
543       return;
544    }
545    i->op = OP_MOV;
546    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
547    i->src(0).mod = Modifier(0);
548 }
549 
550 void
551 ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
552                                         const int s, ImmediateValue& imm2)
553 {
554    const int t = s ? 0 : 1;
555    Instruction *insn;
556    Instruction *mul1 = NULL; // mul1 before mul2
557    int e = 0;
558    float f = imm2.reg.data.f32;
559    ImmediateValue imm1;
560 
561    assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);
562 
563    if (mul2->getSrc(t)->refCount() == 1) {
564       insn = mul2->getSrc(t)->getInsn();
565       if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
566          mul1 = insn;
567       if (mul1 && !mul1->saturate) {
568          int s1;
569 
570          if (mul1->src(s1 = 0).getImmediate(imm1) ||
571              mul1->src(s1 = 1).getImmediate(imm1)) {
572             bld.setPosition(mul1, false);
573             // a = mul r, imm1
574             // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
575             mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
576             mul1->src(s1).mod = Modifier(0);
577             mul2->def(0).replace(mul1->getDef(0), false);
578          } else
579          if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
580             // c = mul a, b
581             // d = mul c, imm   -> d = mul_x_imm a, b
582             mul1->postFactor = e;
583             mul2->def(0).replace(mul1->getDef(0), false);
584             if (f < 0)
585                mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
586          }
587          mul1->saturate = mul2->saturate;
588          return;
589       }
590    }
591    if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
592       // b = mul a, imm
593       // d = mul b, c   -> d = mul_x_imm a, c
594       int s2, t2;
595       insn = mul2->getDef(0)->uses.front()->getInsn();
596       if (!insn)
597          return;
598       mul1 = mul2;
599       mul2 = NULL;
600       s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
601       t2 = s2 ? 0 : 1;
602       if (insn->op == OP_MUL && insn->dType == TYPE_F32)
603          if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
604             mul2 = insn;
605       if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
606          mul2->postFactor = e;
607          mul2->setSrc(s2, mul1->src(t));
608          if (f < 0)
609             mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
610       }
611    }
612 }
613 
614 void
615 ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
616 {
617    const int t = !s;
618    const operation op = i->op;
619 
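   // Strength reductions based on a single constant source, e.g.
   // (illustrative): x * 0 -> 0, x * ±1 -> ±x, x * 2 -> x + x,
   // x * 2^n -> x << n (integers), x + 0 -> x, x % 2^n -> x & (2^n - 1),
   // and integer division by a constant turned into multiply-high sequences.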
620    switch (i->op) {
621    case OP_MUL:
622       if (i->dType == TYPE_F32)
623          tryCollapseChainedMULs(i, s, imm0);
624 
625       if (imm0.isInteger(0)) {
626          i->op = OP_MOV;
627          i->setSrc(0, new_ImmediateValue(prog, 0u));
628          i->src(0).mod = Modifier(0);
629          i->setSrc(1, NULL);
630       } else
631       if (imm0.isInteger(1) || imm0.isInteger(-1)) {
632          if (imm0.isNegative())
633             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
634          i->op = i->src(t).mod.getOp();
635          if (s == 0) {
636             i->setSrc(0, i->getSrc(1));
637             i->src(0).mod = i->src(1).mod;
638             i->src(1).mod = 0;
639          }
640          if (i->op != OP_CVT)
641             i->src(0).mod = 0;
642          i->setSrc(1, NULL);
643       } else
644       if (imm0.isInteger(2) || imm0.isInteger(-2)) {
645          if (imm0.isNegative())
646             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
647          i->op = OP_ADD;
648          i->setSrc(s, i->getSrc(t));
649          i->src(s).mod = i->src(t).mod;
650       } else
651       if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
652          i->op = OP_SHL;
653          imm0.applyLog2();
654          i->setSrc(0, i->getSrc(t));
655          i->src(0).mod = i->src(t).mod;
656          i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
657          i->src(1).mod = 0;
658       }
659       break;
660    case OP_ADD:
661       if (imm0.isInteger(0)) {
662          if (s == 0) {
663             i->setSrc(0, i->getSrc(1));
664             i->src(0).mod = i->src(1).mod;
665          }
666          i->setSrc(1, NULL);
667          i->op = i->src(0).mod.getOp();
668          if (i->op != OP_CVT)
669             i->src(0).mod = Modifier(0);
670       }
671       break;
672 
673    case OP_DIV:
674       if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
675          break;
676       bld.setPosition(i, false);
677       if (imm0.reg.data.u32 == 0) {
678          break;
679       } else
680       if (imm0.reg.data.u32 == 1) {
681          i->op = OP_MOV;
682          i->setSrc(1, NULL);
683       } else
684       if (i->dType == TYPE_U32 && imm0.isPow2()) {
685          i->op = OP_SHR;
686          i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
687       } else
688       if (i->dType == TYPE_U32) {
689          Instruction *mul;
690          Value *tA, *tB;
691          const uint32_t d = imm0.reg.data.u32;
692          uint32_t m;
693          int r, s;
694          uint32_t l = util_logbase2(d);
695          if (((uint32_t)1 << l) < d)
696             ++l;
697          m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
698          r = l ? 1 : 0;
699          s = l ? (l - 1) : 0;
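         // Unsigned division by an invariant constant via multiply-high
         // (cf. Granlund & Montgomery). Worked example for d == 7:
         //   l = 3, m = 0x24924925, r = 1, s = 2
         //   q = (mulhi(x, m) + ((x - mulhi(x, m)) >> 1)) >> 2
         // e.g. x == 100: mulhi = 14, (100 - 14) >> 1 = 43, (14 + 43) >> 2 = 14.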
700 
701          tA = bld.getSSA();
702          tB = bld.getSSA();
703          mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
704                          bld.loadImm(NULL, m));
705          mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
706          bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
707          tA = bld.getSSA();
708          if (r)
709             bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
710          else
711             tA = tB;
712          tB = s ? bld.getSSA() : i->getDef(0);
713          bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
714          if (s)
715             bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
716 
717          delete_Instruction(prog, i);
718       } else
719       if (imm0.reg.data.s32 == -1) {
720          i->op = OP_NEG;
721          i->setSrc(1, NULL);
722       } else {
723          LValue *tA, *tB;
724          LValue *tD;
725          const int32_t d = imm0.reg.data.s32;
726          int32_t m;
727          int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
728          if ((1 << l) < abs(d))
729             ++l;
730          if (!l)
731             l = 1;
732          m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);
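         // Signed variant: tA = mulhi(x, m) + x (MAD with mul-high subop),
         // shifted right by (l - 1); the SET/SUB below add 1 when x is
         // negative so the result rounds toward zero, and the final NEG
         // handles d < 0.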
733 
734          tA = bld.getSSA();
735          tB = bld.getSSA();
736          bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
737                    i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
738          if (l > 1)
739             bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
740          else
741             tB = tA;
742          tA = bld.getSSA();
743          bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, i->getSrc(0), bld.mkImm(0));
744          tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
745          bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
746          if (d < 0)
747             bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tD);
748 
749          delete_Instruction(prog, i);
750       }
751       break;
752 
753    case OP_MOD:
754       if (i->sType == TYPE_U32 && imm0.isPow2()) {
755          bld.setPosition(i, false);
756          i->op = OP_AND;
757          i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
758       }
759       break;
760 
761    case OP_SET: // TODO: SET_AND,OR,XOR
762    {
763       CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
764       CondCode cc, ccZ;
765       if (i->src(t).mod != Modifier(0))
766          return;
767       if (imm0.reg.data.u32 != 0 || !si || si->op != OP_SET)
768          return;
769       cc = si->setCond;
770       ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
771       if (s == 0)
772          ccZ = reverseCondCode(ccZ);
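      // Illustration: with x = F2I(NEG(SET.LT(a, b))), i.e. x is -1 or 0,
      // SET.NE(x, 0) tests the original comparison directly and becomes
      // SET.LT(a, b); SET.EQ(x, 0) selects the inverse condition instead.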
773       switch (ccZ) {
774       case CC_LT: cc = CC_FL; break;
775       case CC_GE: cc = CC_TR; break;
776       case CC_EQ: cc = inverseCondCode(cc); break;
777       case CC_LE: cc = inverseCondCode(cc); break;
778       case CC_GT: break;
779       case CC_NE: break;
780       default:
781          return;
782       }
783       i->asCmp()->setCond = cc;
784       i->setSrc(0, si->src(0));
785       i->setSrc(1, si->src(1));
786       i->sType = si->sType;
787    }
788       break;
789 
790    case OP_SHL:
791    {
792       if (s != 1 || i->src(0).mod != Modifier(0))
793          break;
794       // try to concatenate shifts
795       Instruction *si = i->getSrc(0)->getInsn();
796       if (!si || si->op != OP_SHL)
797          break;
798       ImmediateValue imm1;
799       if (si->src(1).getImmediate(imm1)) {
800          bld.setPosition(i, false);
801          i->setSrc(0, si->getSrc(0));
802          i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
803       }
804    }
805       break;
806 
807    case OP_ABS:
808    case OP_NEG:
809    case OP_LG2:
810    case OP_RCP:
811    case OP_SQRT:
812    case OP_RSQ:
813    case OP_PRESIN:
814    case OP_SIN:
815    case OP_COS:
816    case OP_PREEX2:
817    case OP_EX2:
818       unary(i, imm0);
819       break;
820    default:
821       return;
822    }
823    if (i->op != op)
824       foldCount++;
825 }
826 
827 // =============================================================================
828 
829 // Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
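// e.g. (illustrative) with b = neg(a), add(c, b) becomes add(c, -a); the NEG
// instruction is left to die once b has no other uses.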
830 class ModifierFolding : public Pass
831 {
832 private:
833    virtual bool visit(BasicBlock *);
834 };
835 
836 bool
837 ModifierFolding::visit(BasicBlock *bb)
838 {
839    const Target *target = prog->getTarget();
840 
841    Instruction *i, *next, *mi;
842    Modifier mod;
843 
844    for (i = bb->getEntry(); i; i = next) {
845       next = i->next;
846 
847       if (0 && i->op == OP_SUB) {
848          // turn "sub" into "add neg" (do we really want this ?)
849          i->op = OP_ADD;
850          i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
851       }
852 
853       for (int s = 0; s < 3 && i->srcExists(s); ++s) {
854          mi = i->getSrc(s)->getInsn();
855          if (!mi ||
856              mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
857             continue;
858          if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
859             if ((i->op != OP_ADD &&
860                  i->op != OP_MUL) ||
861                 (mi->op != OP_ABS &&
862                  mi->op != OP_NEG))
863                continue;
864          } else
865          if (i->sType != mi->dType) {
866             continue;
867          }
868          if ((mod = Modifier(mi->op)) == Modifier(0))
869             continue;
870          mod *= mi->src(0).mod;
871 
872          if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
873             // abs neg [abs] = abs
874             mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
875          } else
876          if ((i->op == OP_NEG) && mod.neg()) {
877             assert(s == 0);
878             // neg as both opcode and modifier on same insn is prohibited
879             // neg neg abs = abs, neg neg = identity
880             mod = mod & Modifier(~NV50_IR_MOD_NEG);
881             i->op = mod.getOp();
882             mod = mod & Modifier(~NV50_IR_MOD_ABS);
883             if (mod == Modifier(0))
884                i->op = OP_MOV;
885          }
886 
887          if (target->isModSupported(i, s, mod)) {
888             i->setSrc(s, mi->getSrc(0));
889             i->src(s).mod *= mod;
890          }
891       }
892 
893       if (i->op == OP_SAT) {
894          mi = i->getSrc(0)->getInsn();
895          if (mi &&
896              mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
897             mi->saturate = 1;
898             mi->setDef(0, i->getDef(0));
899             delete_Instruction(prog, i);
900          }
901       }
902    }
903 
904    return true;
905 }
906 
907 // =============================================================================
908 
909 // MUL + ADD -> MAD/FMA
910 // MIN/MAX(a, a) -> a, etc.
911 // SLCT(a, b, const) -> cc(const) ? a : b
912 // RCP(RCP(a)) -> a
913 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
914 class AlgebraicOpt : public Pass
915 {
916 private:
917    virtual bool visit(BasicBlock *);
918 
919    void handleABS(Instruction *);
920    bool handleADD(Instruction *);
921    bool tryADDToMADOrSAD(Instruction *, operation toOp);
922    void handleMINMAX(Instruction *);
923    void handleRCP(Instruction *);
924    void handleSLCT(Instruction *);
925    void handleLOGOP(Instruction *);
926    void handleCVT(Instruction *);
927 
928    BuildUtil bld;
929 };
930 
931 void
932 AlgebraicOpt::handleABS(Instruction *abs)
933 {
934    Instruction *sub = abs->getSrc(0)->getInsn();
935    DataType ty;
936    if (!sub ||
937        !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
938       return;
939    // expect not to have mods yet, if we do, bail
940    if (sub->src(0).mod || sub->src(1).mod)
941       return;
942    // hidden conversion ?
943    ty = intTypeToSigned(sub->dType);
944    if (abs->dType != abs->sType || ty != abs->sType)
945       return;
946 
947    if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
948        sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
949        sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
950          return;
951 
952    Value *src0 = sub->getSrc(0);
953    Value *src1 = sub->getSrc(1);
954 
955    if (sub->op == OP_ADD) {
956       Instruction *neg = sub->getSrc(1)->getInsn();
957       if (neg && neg->op != OP_NEG) {
958          neg = sub->getSrc(0)->getInsn();
959          src0 = sub->getSrc(1);
960       }
961       if (!neg || neg->op != OP_NEG ||
962           neg->dType != neg->sType || neg->sType != ty)
963          return;
964       src1 = neg->getSrc(0);
965    }
966 
967    // found ABS(SUB)
968    abs->moveSources(1, 2); // move sources >=1 up by 2
969    abs->op = OP_SAD;
970    abs->setType(sub->dType);
971    abs->setSrc(0, src0);
972    abs->setSrc(1, src1);
973    bld.setPosition(abs, false);
974    abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
975 }
976 
977 bool
978 AlgebraicOpt::handleADD(Instruction *add)
979 {
980    Value *src0 = add->getSrc(0);
981    Value *src1 = add->getSrc(1);
982 
983    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
984       return false;
985 
986    bool changed = false;
987    if (!changed && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
988       changed = tryADDToMADOrSAD(add, OP_MAD);
989    if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
990       changed = tryADDToMADOrSAD(add, OP_SAD);
991    return changed;
992 }
993 
994 // ADD(SAD(a,b,0), c) -> SAD(a,b,c)
995 // ADD(MUL(a,b), c) -> MAD(a,b,c)
996 bool
997 AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
998 {
999    Value *src0 = add->getSrc(0);
1000    Value *src1 = add->getSrc(1);
1001    Value *src;
1002    int s;
1003    const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
1004    const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
1005    Modifier mod[4];
1006 
1007    if (src0->refCount() == 1 &&
1008        src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
1009       s = 0;
1010    else
1011    if (src1->refCount() == 1 &&
1012        src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
1013       s = 1;
1014    else
1015       return false;
1016 
1017    if ((src0->getUniqueInsn() && src0->getUniqueInsn()->bb != add->bb) ||
1018        (src1->getUniqueInsn() && src1->getUniqueInsn()->bb != add->bb))
1019       return false;
1020 
1021    src = add->getSrc(s);
1022 
1023    if (src->getInsn()->postFactor)
1024       return false;
1025    if (toOp == OP_SAD) {
1026       ImmediateValue imm;
1027       if (!src->getInsn()->src(2).getImmediate(imm))
1028          return false;
1029       if (!imm.isInteger(0))
1030          return false;
1031    }
1032 
1033    mod[0] = add->src(0).mod;
1034    mod[1] = add->src(1).mod;
1035    mod[2] = src->getUniqueInsn()->src(0).mod;
1036    mod[3] = src->getUniqueInsn()->src(1).mod;
1037 
1038    if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
1039       return false;
1040 
1041    add->op = toOp;
1042    add->subOp = src->getInsn()->subOp; // potentially mul-high
1043 
1044    add->setSrc(2, add->src(s ? 0 : 1));
1045 
1046    add->setSrc(0, src->getInsn()->getSrc(0));
1047    add->src(0).mod = mod[2] ^ mod[s];
1048    add->setSrc(1, src->getInsn()->getSrc(1));
1049    add->src(1).mod = mod[3];
1050 
1051    return true;
1052 }
1053 
1054 void
1055 AlgebraicOpt::handleMINMAX(Instruction *minmax)
1056 {
1057    Value *src0 = minmax->getSrc(0);
1058    Value *src1 = minmax->getSrc(1);
1059 
1060    if (src0 != src1 || src0->reg.file != FILE_GPR)
1061       return;
1062    if (minmax->src(0).mod == minmax->src(1).mod) {
1063       if (minmax->def(0).mayReplace(minmax->src(0))) {
1064          minmax->def(0).replace(minmax->src(0), false);
1065          minmax->bb->remove(minmax);
1066       } else {
1067          minmax->op = OP_CVT;
1068          minmax->setSrc(1, NULL);
1069       }
1070    } else {
1071       // TODO:
1072       // min(x, -x) = -abs(x)
1073       // min(x, -abs(x)) = -abs(x)
1074       // min(x, abs(x)) = x
1075       // max(x, -abs(x)) = x
1076       // max(x, abs(x)) = abs(x)
1077       // max(x, -x) = abs(x)
1078    }
1079 }
1080 
1081 void
1082 AlgebraicOpt::handleRCP(Instruction *rcp)
1083 {
1084    Instruction *si = rcp->getSrc(0)->getUniqueInsn();
1085 
1086    if (si && si->op == OP_RCP) {
1087       Modifier mod = rcp->src(0).mod * si->src(0).mod;
1088       rcp->op = mod.getOp();
1089       rcp->setSrc(0, si->getSrc(0));
1090    }
1091 }
1092 
1093 void
1094 AlgebraicOpt::handleSLCT(Instruction *slct)
1095 {
1096    if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
1097       if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
1098          slct->setSrc(0, slct->getSrc(1));
1099    } else
1100    if (slct->getSrc(0) != slct->getSrc(1)) {
1101       return;
1102    }
1103    slct->op = OP_MOV;
1104    slct->setSrc(1, NULL);
1105    slct->setSrc(2, NULL);
1106 }
1107 
1108 void
1109 AlgebraicOpt::handleLOGOP(Instruction *logop)
1110 {
1111    Value *src0 = logop->getSrc(0);
1112    Value *src1 = logop->getSrc(1);
1113 
1114    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
1115       return;
1116 
1117    if (src0 == src1) {
1118       if ((logop->op == OP_AND || logop->op == OP_OR) &&
1119           logop->def(0).mayReplace(logop->src(0))) {
1120          logop->def(0).replace(logop->src(0), false);
1121          delete_Instruction(prog, logop);
1122       }
1123    } else {
1124       // try AND(SET, SET) -> SET_AND(SET)
1125       Instruction *set0 = src0->getInsn();
1126       Instruction *set1 = src1->getInsn();
1127 
1128       if (!set0 || set0->fixed || !set1 || set1->fixed)
1129          return;
1130       if (set1->op != OP_SET) {
1131          Instruction *xchg = set0;
1132          set0 = set1;
1133          set1 = xchg;
1134          if (set1->op != OP_SET)
1135             return;
1136       }
1137       operation redOp = (logop->op == OP_AND ? OP_SET_AND :
1138                          logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
1139       if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
1140          return;
1141       if (set0->op != OP_SET &&
1142           set0->op != OP_SET_AND &&
1143           set0->op != OP_SET_OR &&
1144           set0->op != OP_SET_XOR)
1145          return;
1146       if (set0->getDef(0)->refCount() > 1 &&
1147           set1->getDef(0)->refCount() > 1)
1148          return;
1149       if (set0->getPredicate() || set1->getPredicate())
1150          return;
1151       // check that they don't source each other
1152       for (int s = 0; s < 2; ++s)
1153          if (set0->getSrc(s) == set1->getDef(0) ||
1154              set1->getSrc(s) == set0->getDef(0))
1155             return;
1156 
1157       set0 = cloneForward(func, set0);
1158       set1 = cloneShallow(func, set1);
1159       logop->bb->insertAfter(logop, set1);
1160       logop->bb->insertAfter(logop, set0);
1161 
1162       set0->dType = TYPE_U8;
1163       set0->getDef(0)->reg.file = FILE_PREDICATE;
1164       set0->getDef(0)->reg.size = 1;
1165       set1->setSrc(2, set0->getDef(0));
1166       set1->op = redOp;
1167       set1->setDef(0, logop->getDef(0));
1168       delete_Instruction(prog, logop);
1169    }
1170 }
1171 
1172 // F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
1173 // nv50:
1174 //  F2I(NEG(I2F(ABS(SET))))
1175 void
1176 AlgebraicOpt::handleCVT(Instruction *cvt)
1177 {
1178    if (cvt->sType != TYPE_F32 ||
1179        cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
1180       return;
1181    Instruction *insn = cvt->getSrc(0)->getInsn();
1182    if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
1183       return;
1184    if (insn->src(0).mod != Modifier(0))
1185       return;
1186    insn = insn->getSrc(0)->getInsn();
1187 
1188    // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
1189    if (insn && insn->op == OP_CVT &&
1190        insn->dType == TYPE_F32 &&
1191        insn->sType == TYPE_S32) {
1192       insn = insn->getSrc(0)->getInsn();
1193       if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
1194           insn->src(0).mod)
1195          return;
1196       insn = insn->getSrc(0)->getInsn();
1197       if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
1198          return;
1199    } else
1200    if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
1201       return;
1202    }
1203 
1204    Instruction *bset = cloneShallow(func, insn);
1205    bset->dType = TYPE_U32;
1206    bset->setDef(0, cvt->getDef(0));
1207    cvt->bb->insertAfter(cvt, bset);
1208    delete_Instruction(prog, cvt);
1209 }
1210 
1211 bool
1212 AlgebraicOpt::visit(BasicBlock *bb)
1213 {
1214    Instruction *next;
1215    for (Instruction *i = bb->getEntry(); i; i = next) {
1216       next = i->next;
1217       switch (i->op) {
1218       case OP_ABS:
1219          handleABS(i);
1220          break;
1221       case OP_ADD:
1222          handleADD(i);
1223          break;
1224       case OP_RCP:
1225          handleRCP(i);
1226          break;
1227       case OP_MIN:
1228       case OP_MAX:
1229          handleMINMAX(i);
1230          break;
1231       case OP_SLCT:
1232          handleSLCT(i);
1233          break;
1234       case OP_AND:
1235       case OP_OR:
1236       case OP_XOR:
1237          handleLOGOP(i);
1238          break;
1239       case OP_CVT:
1240          handleCVT(i);
1241          break;
1242       default:
1243          break;
1244       }
1245    }
1246 
1247    return true;
1248 }
1249 
1250 // =============================================================================
1251 
1252 static inline void
1253 updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
1254 {
1255    if (offset != ldst->getSrc(0)->reg.data.offset) {
1256       if (ldst->getSrc(0)->refCount() > 1)
1257          ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
1258       ldst->getSrc(0)->reg.data.offset = offset;
1259    }
1260 }
1261 
1262 // Combine loads and stores, forward stores to loads where possible.
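// e.g. (illustrative) two adjacent 32-bit loads from c[0x10] and c[0x14] can
// be combined into a single 64-bit load, and a load following a store to the
// same offset can take its value from the store instead of reading memory.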
1263 class MemoryOpt : public Pass
1264 {
1265 private:
1266    class Record
1267    {
1268    public:
1269       Record *next;
1270       Instruction *insn;
1271       const Value *rel[2];
1272       const Value *base;
1273       int32_t offset;
1274       int8_t fileIndex;
1275       uint8_t size;
1276       bool locked;
1277       Record *prev;
1278 
1279       bool overlaps(const Instruction *ldst) const;
1280 
1281       inline void link(Record **);
1282       inline void unlink(Record **);
1283       inline void set(const Instruction *ldst);
1284    };
1285 
1286 public:
1287    MemoryOpt();
1288 
1289    Record *loads[DATA_FILE_COUNT];
1290    Record *stores[DATA_FILE_COUNT];
1291 
1292    MemoryPool recordPool;
1293 
1294 private:
1295    virtual bool visit(BasicBlock *);
1296    bool runOpt(BasicBlock *);
1297 
1298    Record **getList(const Instruction *);
1299 
1300    Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;
1301 
1302    // merge @insn into load/store instruction from @rec
1303    bool combineLd(Record *rec, Instruction *ld);
1304    bool combineSt(Record *rec, Instruction *st);
1305 
1306    bool replaceLdFromLd(Instruction *ld, Record *ldRec);
1307    bool replaceLdFromSt(Instruction *ld, Record *stRec);
1308    bool replaceStFromSt(Instruction *restrict st, Record *stRec);
1309 
1310    void addRecord(Instruction *ldst);
1311    void purgeRecords(Instruction *const st, DataFile);
1312    void lockStores(Instruction *const ld);
1313    void reset();
1314 
1315 private:
1316    Record *prevRecord;
1317 };
1318 
1319 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
1320 {
1321    for (int i = 0; i < DATA_FILE_COUNT; ++i) {
1322       loads[i] = NULL;
1323       stores[i] = NULL;
1324    }
1325    prevRecord = NULL;
1326 }
1327 
1328 void
1329 MemoryOpt::reset()
1330 {
1331    for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
1332       Record *it, *next;
1333       for (it = loads[i]; it; it = next) {
1334          next = it->next;
1335          recordPool.release(it);
1336       }
1337       loads[i] = NULL;
1338       for (it = stores[i]; it; it = next) {
1339          next = it->next;
1340          recordPool.release(it);
1341       }
1342       stores[i] = NULL;
1343    }
1344 }
1345 
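// Merge load @ld into the load recorded in @rec if the combined access is
// supported and properly aligned; the defs of both loads end up on the
// surviving instruction in address order.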
1346 bool
1347 MemoryOpt::combineLd(Record *rec, Instruction *ld)
1348 {
1349    int32_t offRc = rec->offset;
1350    int32_t offLd = ld->getSrc(0)->reg.data.offset;
1351    int sizeRc = rec->size;
1352    int sizeLd = typeSizeof(ld->dType);
1353    int size = sizeRc + sizeLd;
1354    int d, j;
1355 
1356    if (!prog->getTarget()->
1357        isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
1358       return false;
1359    // no unaligned loads
1360    if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
1361        ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
1362       return false;
1363 
1364    assert(sizeRc + sizeLd <= 16 && offRc != offLd);
1365 
1366    for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);
1367 
1368    if (offLd < offRc) {
1369       int sz;
1370       for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
1371       // d: nr of definitions in ld
1372       // j: nr of definitions in rec->insn, move:
1373       for (d = d + j - 1; j > 0; --j, --d)
1374          rec->insn->setDef(d, rec->insn->getDef(j - 1));
1375 
1376       if (rec->insn->getSrc(0)->refCount() > 1)
1377          rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
1378       rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;
1379 
1380       d = 0;
1381    } else {
1382       d = j;
1383    }
1384    // move definitions of @ld to @rec->insn
1385    for (j = 0; sizeLd; ++j, ++d) {
1386       sizeLd -= ld->getDef(j)->reg.size;
1387       rec->insn->setDef(d, ld->getDef(j));
1388    }
1389 
1390    rec->size = size;
1391    rec->insn->getSrc(0)->reg.size = size;
1392    rec->insn->setType(typeOfSize(size));
1393 
1394    delete_Instruction(prog, ld);
1395 
1396    return true;
1397 }
1398 
1399 bool
1400 MemoryOpt::combineSt(Record *rec, Instruction *st)
1401 {
1402    int32_t offRc = rec->offset;
1403    int32_t offSt = st->getSrc(0)->reg.data.offset;
1404    int sizeRc = rec->size;
1405    int sizeSt = typeSizeof(st->dType);
1406    int s = sizeSt / 4;
1407    int size = sizeRc + sizeSt;
1408    int j, k;
1409    Value *src[4]; // no modifiers in ValueRef allowed for st
1410    Value *extra[3];
1411 
1412    if (!prog->getTarget()->
1413        isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
1414       return false;
1415    if (size == 8 && MIN2(offRc, offSt) & 0x7)
1416       return false;
1417 
1418    st->takeExtraSources(0, extra); // save predicate and indirect address
1419 
1420    if (offRc < offSt) {
1421       // save values from @st
1422       for (s = 0; sizeSt; ++s) {
1423          sizeSt -= st->getSrc(s + 1)->reg.size;
1424          src[s] = st->getSrc(s + 1);
1425       }
1426       // set record's values as low sources of @st
1427       for (j = 1; sizeRc; ++j) {
1428          sizeRc -= rec->insn->getSrc(j)->reg.size;
1429          st->setSrc(j, rec->insn->getSrc(j));
1430       }
1431       // set saved values as high sources of @st
1432       for (k = j, j = 0; j < s; ++j)
1433          st->setSrc(k++, src[j]);
1434 
1435       updateLdStOffset(st, offRc, func);
1436    } else {
1437       for (j = 1; sizeSt; ++j)
1438          sizeSt -= st->getSrc(j)->reg.size;
1439       for (s = 1; sizeRc; ++j, ++s) {
1440          sizeRc -= rec->insn->getSrc(s)->reg.size;
1441          st->setSrc(j, rec->insn->getSrc(s));
1442       }
1443       rec->offset = offSt;
1444    }
1445    st->putExtraSources(0, extra); // restore pointer and predicate
1446 
1447    delete_Instruction(prog, rec->insn);
1448    rec->insn = st;
1449    rec->size = size;
1450    rec->insn->getSrc(0)->reg.size = size;
1451    rec->insn->setType(typeOfSize(size));
1452    return true;
1453 }
1454 
1455 void
1456 MemoryOpt::Record::set(const Instruction *ldst)
1457 {
1458    const Symbol *mem = ldst->getSrc(0)->asSym();
1459    fileIndex = mem->reg.fileIndex;
1460    rel[0] = ldst->getIndirect(0, 0);
1461    rel[1] = ldst->getIndirect(0, 1);
1462    offset = mem->reg.data.offset;
1463    base = mem->getBase();
1464    size = typeSizeof(ldst->sType);
1465 }
1466 
1467 void
1468 MemoryOpt::Record::link(Record **list)
1469 {
1470    next = *list;
1471    if (next)
1472       next->prev = this;
1473    prev = NULL;
1474    *list = this;
1475 }
1476 
1477 void
1478 MemoryOpt::Record::unlink(Record **list)
1479 {
1480    if (next)
1481       next->prev = prev;
1482    if (prev)
1483       prev->next = next;
1484    else
1485       *list = next;
1486 }
1487 
1488 MemoryOpt::Record **
1489 MemoryOpt::getList(const Instruction *insn)
1490 {
1491    if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
1492       return &loads[insn->src(0).getFile()];
1493    return &stores[insn->src(0).getFile()];
1494 }
1495 
1496 void
1497 MemoryOpt::addRecord(Instruction *i)
1498 {
1499    Record **list = getList(i);
1500    Record *it = reinterpret_cast<Record *>(recordPool.allocate());
1501 
1502    it->link(list);
1503    it->set(i);
1504    it->insn = i;
1505    it->locked = false;
1506 }
1507 
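// Look for a previous load/store record accessing the same 16-byte block as
// @insn (same file index and indirect sources). @isAdj tells the caller
// whether the record should be combined with @insn (adjacent accesses) or can
// replace/forward it (same or covering location).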
1508 MemoryOpt::Record *
1509 MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
1510 {
1511    const Symbol *sym = insn->getSrc(0)->asSym();
1512    const int size = typeSizeof(insn->sType);
1513    Record *rec = NULL;
1514    Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];
1515 
1516    for (; it; it = it->next) {
1517       if (it->locked && insn->op != OP_LOAD)
1518          continue;
1519       if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
1520           it->rel[0] != insn->getIndirect(0, 0) ||
1521           it->fileIndex != sym->reg.fileIndex ||
1522           it->rel[1] != insn->getIndirect(0, 1))
1523          continue;
1524 
1525       if (it->offset < sym->reg.data.offset) {
1526          if (it->offset + it->size >= sym->reg.data.offset) {
1527             isAdj = (it->offset + it->size == sym->reg.data.offset);
1528             if (!isAdj)
1529                return it;
1530             if (!(it->offset & 0x7))
1531                rec = it;
1532          }
1533       } else {
1534          isAdj = it->offset != sym->reg.data.offset;
1535          if (size <= it->size && !isAdj)
1536             return it;
1537          else
1538          if (!(sym->reg.data.offset & 0x7))
1539             if (it->offset - size <= sym->reg.data.offset)
1540                rec = it;
1541       }
1542    }
1543    return rec;
1544 }
1545 
1546 bool
1547 MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
1548 {
1549    Instruction *st = rec->insn;
1550    int32_t offSt = rec->offset;
1551    int32_t offLd = ld->getSrc(0)->reg.data.offset;
1552    int d, s;
1553 
1554    for (s = 1; offSt != offLd && st->srcExists(s); ++s)
1555       offSt += st->getSrc(s)->reg.size;
1556    if (offSt != offLd)
1557       return false;
1558 
1559    for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
1560       if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
1561          return false;
1562       if (st->getSrc(s)->reg.file != FILE_GPR)
1563          return false;
1564       ld->def(d).replace(st->src(s), false);
1565    }
1566    ld->bb->remove(ld);
1567    return true;
1568 }
1569 
1570 bool
1571 MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
1572 {
1573    Instruction *ldR = rec->insn;
1574    int32_t offR = rec->offset;
1575    int32_t offE = ldE->getSrc(0)->reg.data.offset;
1576    int dR, dE;
1577 
1578    assert(offR <= offE);
1579    for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
1580       offR += ldR->getDef(dR)->reg.size;
1581    if (offR != offE)
1582       return false;
1583 
1584    for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
1585       if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
1586          return false;
1587       ldE->def(dE).replace(ldR->getDef(dR), false);
1588    }
1589 
1590    delete_Instruction(prog, ldE);
1591    return true;
1592 }
1593 
1594 bool
1595 MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
1596 {
1597    const Instruction *const ri = rec->insn;
1598    Value *extra[3];
1599 
1600    int32_t offS = st->getSrc(0)->reg.data.offset;
1601    int32_t offR = rec->offset;
1602    int32_t endS = offS + typeSizeof(st->dType);
1603    int32_t endR = offR + typeSizeof(ri->dType);
1604 
1605    rec->size = MAX2(endS, endR) - MIN2(offS, offR);
1606 
1607    st->takeExtraSources(0, extra);
1608 
1609    if (offR < offS) {
1610       Value *vals[10];
1611       int s, n;
1612       int k = 0;
1613       // get non-replaced sources of ri
1614       for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
1615          vals[k++] = ri->getSrc(s);
1616       n = s;
1617       // get replaced sources of st
1618       for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
1619          vals[k++] = st->getSrc(s);
1620       // skip replaced sources of ri
1621       for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
1622       // get non-replaced sources after values covered by st
1623       for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
1624          vals[k++] = ri->getSrc(s);
1625       assert((unsigned int)k <= Elements(vals));
1626       for (s = 0; s < k; ++s)
1627          st->setSrc(s + 1, vals[s]);
1628       st->setSrc(0, ri->getSrc(0));
1629    } else
1630    if (endR > endS) {
1631       int j, s;
1632       for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
1633       for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
1634       for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
1635          st->setSrc(s++, ri->getSrc(j));
1636    }
1637    st->putExtraSources(0, extra);
1638 
1639    delete_Instruction(prog, rec->insn);
1640 
1641    rec->insn = st;
1642    rec->offset = st->getSrc(0)->reg.data.offset;
1643 
1644    st->setType(typeOfSize(rec->size));
1645 
1646    return true;
1647 }
1648 
1649 bool
1650 MemoryOpt::Record::overlaps(const Instruction *ldst) const
1651 {
1652    Record that;
1653    that.set(ldst);
1654 
1655    if (this->fileIndex != that.fileIndex)
1656       return false;
1657 
1658    if (this->rel[0] || that.rel[0])
1659       return this->base == that.base;
1660    return
1661       (this->offset < that.offset + that.size) &&
1662       (this->offset + this->size > that.offset);
1663 }
1664 
1665 // Once a load has read from a location, earlier stores to that location
1666 // must not be eliminated when later stores overwrite it, and they may no
1667 // longer be merged with later stores, since they affect the result of @ld.
1668 // The stored value can, however, still be used to determine the value
1669 // returned by future loads.
1670 void
1671 MemoryOpt::lockStores(Instruction *const ld)
1672 {
1673    for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
1674       if (!r->locked && r->overlaps(ld))
1675          r->locked = true;
1676 }
1677 
1678 // Prior loads from the location of @st are no longer valid.
1679 // Stores to the location of @st may no longer be used to derive
1680 // the value at it nor be coalesced into later stores.
1681 void
1682 MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
1683 {
1684    if (st)
1685       f = st->src(0).getFile();
1686 
1687    for (Record *r = loads[f]; r; r = r->next)
1688       if (!st || r->overlaps(st))
1689          r->unlink(&loads[f]);
1690 
1691    for (Record *r = stores[f]; r; r = r->next)
1692       if (!st || r->overlaps(st))
1693          r->unlink(&stores[f]);
1694 }
1695 
1696 bool
1697 MemoryOpt::visit(BasicBlock *bb)
1698 {
1699    bool ret = runOpt(bb);
1700    // Run again: one pass won't combine four 32-bit ld/st into a single
1701    // 128-bit ld/st where 96-bit memory operations are forbidden.
1702    if (ret)
1703       ret = runOpt(bb);
1704    return ret;
1705 }
1706 
1707 bool
1708 MemoryOpt::runOpt(BasicBlock *bb)
1709 {
1710    Instruction *ldst, *next;
1711    Record *rec;
1712    bool isAdjacent = true;
1713 
1714    for (ldst = bb->getEntry(); ldst; ldst = next) {
1715       bool keep = true;
1716       bool isLoad = true;
1717       next = ldst->next;
1718 
1719       if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
1720          if (ldst->isDead()) {
1721             // might have been produced by earlier optimization
1722             delete_Instruction(prog, ldst);
1723             continue;
1724          }
1725       } else
1726       if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
1727          isLoad = false;
1728       } else {
1729          // TODO: maybe have all fixed ops act as a barrier?
1730          if (ldst->op == OP_CALL) {
1731             purgeRecords(NULL, FILE_MEMORY_LOCAL);
1732             purgeRecords(NULL, FILE_MEMORY_GLOBAL);
1733             purgeRecords(NULL, FILE_MEMORY_SHARED);
1734             purgeRecords(NULL, FILE_SHADER_OUTPUT);
1735          } else
1736          if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
1737             purgeRecords(NULL, FILE_SHADER_OUTPUT);
1738          }
1739          continue;
1740       }
1741       if (ldst->getPredicate()) // TODO: handle predicated ld/st
1742          continue;
1743 
1744       if (isLoad) {
1745          DataFile file = ldst->src(0).getFile();
1746 
1747          // for a load from l[]/g[], look for a previous store to eliminate the reload
1748          if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
1749             // TODO: shared memory ?
1750             rec = findRecord(ldst, false, isAdjacent);
1751             if (rec && !isAdjacent)
1752                keep = !replaceLdFromSt(ldst, rec);
1753          }
1754 
1755          // or look for ld from the same location and replace this one
1756          rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
1757          if (rec) {
1758             if (!isAdjacent)
1759                keep = !replaceLdFromLd(ldst, rec);
1760             else
1761                // or combine a previous load with this one
1762                keep = !combineLd(rec, ldst);
1763          }
1764          if (keep)
1765             lockStores(ldst);
1766       } else {
1767          rec = findRecord(ldst, false, isAdjacent);
1768          if (rec) {
1769             if (!isAdjacent)
1770                keep = !replaceStFromSt(ldst, rec);
1771             else
1772                keep = !combineSt(rec, ldst);
1773          }
1774          if (keep)
1775             purgeRecords(ldst, DATA_FILE_COUNT);
1776       }
1777       if (keep)
1778          addRecord(ldst);
1779    }
1780    reset();
1781 
1782    return true;
1783 }
1784 
1785 // =============================================================================
1786 
1787 // Turn control flow into predicated instructions (after register allocation !).
1788 // TODO:
1789 // Could move this to before register allocation on NVC0 and also handle nested
1790 // constructs.
1791 class FlatteningPass : public Pass
1792 {
1793 private:
1794    virtual bool visit(BasicBlock *);
1795 
1796    bool tryPredicateConditional(BasicBlock *);
1797    void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
1798    void tryPropagateBranch(BasicBlock *);
1799    inline bool isConstantCondition(Value *pred);
1800    inline bool mayPredicate(const Instruction *, const Value *pred) const;
1801    inline void removeFlow(Instruction *);
1802 };
1803 
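// A condition counts as constant if it is a plain OP_SET whose operands come
// from immediates or the constant buffer (without indirect addressing), i.e.
// its value is presumably uniform across a warp.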
1804 bool
1805 FlatteningPass::isConstantCondition(Value *pred)
1806 {
1807    Instruction *insn = pred->getUniqueInsn();
1808    assert(insn);
1809    if (insn->op != OP_SET || insn->srcExists(2))
1810       return false;
1811 
1812    for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
1813       Instruction *ld = insn->getSrc(s)->getUniqueInsn();
1814       DataFile file;
1815       if (ld) {
1816          if (ld->op != OP_MOV && ld->op != OP_LOAD)
1817             return false;
1818          if (ld->src(0).isIndirect(0))
1819             return false;
1820          file = ld->src(0).getFile();
1821       } else {
1822          file = insn->src(s).getFile();
1823          // catch $r63 on NVC0
1824          if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR)
1825             file = FILE_IMMEDIATE;
1826       }
1827       if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
1828          return false;
1829    }
1830    return true;
1831 }
1832 
1833 void
1834 FlatteningPass::removeFlow(Instruction *insn)
1835 {
1836    FlowInstruction *term = insn ? insn->asFlow() : NULL;
1837    if (!term)
1838       return;
1839    Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();
1840 
1841    if (term->op == OP_BRA) {
1842       // TODO: this might get more difficult when we get arbitrary BRAs
1843       if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
1844          return;
1845    } else
1846    if (term->op != OP_JOIN)
1847       return;
1848 
1849    Value *pred = term->getPredicate();
1850 
1851    delete_Instruction(prog, term);
1852 
1853    if (pred && pred->refCount() == 0) {
1854       Instruction *pSet = pred->getUniqueInsn();
1855       pred->join->reg.data.id = -1; // deallocate
1856       if (pSet->isDead())
1857          delete_Instruction(prog, pSet);
1858    }
1859 }
1860 
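// Predicate every non-nop instruction in @bb with (cc, pred) and remove the
// block's terminating flow instruction.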
1861 void
1862 FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
1863 {
1864    for (Instruction *i = bb->getEntry(); i; i = i->next) {
1865       if (i->isNop())
1866          continue;
1867       assert(!i->getPredicate());
1868       i->setPredicate(cc, pred);
1869    }
1870    removeFlow(bb->getExit());
1871 }
1872 
1873 bool
1874 FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
1875 {
1876    if (insn->isPseudo())
1877       return true;
1878    // TODO: calls where we don't know which registers are modified
1879 
1880    if (!prog->getTarget()->mayPredicate(insn, pred))
1881       return false;
1882    for (int d = 0; insn->defExists(d); ++d)
1883       if (insn->getDef(d)->equals(pred))
1884          return false;
1885    return true;
1886 }
1887 
1888 // If we conditionally skip over or branch to a lone branch instruction,
1889 // replace ours with it.  NOTE: the CFG is no longer updated here!
1890 void
1891 FlatteningPass::tryPropagateBranch(BasicBlock *bb)
1892 {
1893    BasicBlock *bf = NULL;
1894    unsigned int i;
1895 
1896    if (bb->cfg.outgoingCount() != 2)
1897       return;
1898    if (!bb->getExit() || bb->getExit()->op != OP_BRA)
1899       return;
1900    Graph::EdgeIterator ei = bb->cfg.outgoing();
1901 
1902    for (i = 0; !ei.end(); ++i, ei.next()) {
1903       bf = BasicBlock::get(ei.getNode());
1904       if (bf->getInsnCount() == 1)
1905          break;
1906    }
1907    if (ei.end() || !bf->getExit())
1908       return;
1909    FlowInstruction *bra = bb->getExit()->asFlow();
1910    FlowInstruction *rep = bf->getExit()->asFlow();
1911 
1912    if (rep->getPredicate())
1913       return;
1914    if (rep->op != OP_BRA &&
1915        rep->op != OP_JOIN &&
1916        rep->op != OP_EXIT)
1917       return;
1918 
1919    bra->op = rep->op;
1920    bra->target.bb = rep->target.bb;
1921    if (i) // 2nd out block means branch not taken
1922       bra->cc = inverseCondCode(bra->cc);
1923    bf->remove(rep);
1924 }
1925 
1926 bool
1927 FlatteningPass::visit(BasicBlock *bb)
1928 {
1929    if (tryPredicateConditional(bb))
1930       return true;
1931 
1932    // try to attach join to previous instruction
1933    Instruction *insn = bb->getExit();
1934    if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
1935       insn = insn->prev;
1936       if (insn && !insn->getPredicate() &&
1937           !insn->asFlow() &&
1938           insn->op != OP_TEXBAR &&
1939           !isTextureOp(insn->op) && // probably just nve4
1940           insn->op != OP_LINTERP && // probably just nve4
1941           insn->op != OP_PINTERP && // probably just nve4
1942           ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
1943            typeSizeof(insn->dType) <= 4) &&
1944           !insn->isNop()) {
1945          insn->join = 1;
1946          bb->remove(bb->getExit());
1947          return true;
1948       }
1949    }
1950 
1951    tryPropagateBranch(bb);
1952 
1953    return true;
1954 }
1955 
1956 bool
1957 FlatteningPass::tryPredicateConditional(BasicBlock *bb)
1958 {
1959    BasicBlock *bL = NULL, *bR = NULL;
1960    unsigned int nL = 0, nR = 0, limit = 12;
1961    Instruction *insn;
1962    unsigned int mask;
1963 
1964    mask = bb->initiatesSimpleConditional();
1965    if (!mask)
1966       return false;
1967 
1968    assert(bb->getExit());
1969    Value *pred = bb->getExit()->getPredicate();
1970    assert(pred);
1971 
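   // A constant condition makes the branch non-divergent and hence cheap, so
   // predication presumably only pays off for very short blocks.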
1972    if (isConstantCondition(pred))
1973       limit = 4;
1974 
1975    Graph::EdgeIterator ei = bb->cfg.outgoing();
1976 
1977    if (mask & 1) {
1978       bL = BasicBlock::get(ei.getNode());
1979       for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
1980          if (!mayPredicate(insn, pred))
1981             return false;
1982       if (nL > limit)
1983          return false; // too long, do a real branch
1984    }
1985    ei.next();
1986 
1987    if (mask & 2) {
1988       bR = BasicBlock::get(ei.getNode());
1989       for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
1990          if (!mayPredicate(insn, pred))
1991             return false;
1992       if (nR > limit)
1993          return false; // too long, do a real branch
1994    }
1995 
1996    if (bL)
1997       predicateInstructions(bL, pred, bb->getExit()->cc);
1998    if (bR)
1999       predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));
2000 
2001    if (bb->joinAt) {
2002       bb->remove(bb->joinAt);
2003       bb->joinAt = NULL;
2004    }
2005    removeFlow(bb->getExit()); // delete the branch/join at the fork point
2006 
2007    // remove potential join operations at the end of the conditional
2008    if (prog->getTarget()->joinAnterior) {
2009       bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
2010       if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
2011          removeFlow(bb->getEntry());
2012    }
2013 
2014    return true;
2015 }
2016 
2017 // =============================================================================
2018 
2019 // Common subexpression elimination. Naive O(n^2) implementation.
2020 class LocalCSE : public Pass
2021 {
2022 private:
2023    virtual bool visit(BasicBlock *);
2024 
2025    inline bool tryReplace(Instruction **, Instruction *);
2026 
2027    DLList ops[OP_LAST + 1];
2028 };
2029 
2030 class GlobalCSE : public Pass
2031 {
2032 private:
2033    virtual bool visit(BasicBlock *);
2034 };
2035 
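// Two instructions perform the same action if everything except their
// operands matches: opcode, types, condition code and the various modifiers.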
2036 bool
2037 Instruction::isActionEqual(const Instruction *that) const
2038 {
2039    if (this->op != that->op ||
2040        this->dType != that->dType ||
2041        this->sType != that->sType)
2042       return false;
2043    if (this->cc != that->cc)
2044       return false;
2045 
2046    if (this->asTex()) {
2047       if (memcmp(&this->asTex()->tex,
2048                  &that->asTex()->tex,
2049                  sizeof(this->asTex()->tex)))
2050          return false;
2051    } else
2052    if (this->asCmp()) {
2053       if (this->asCmp()->setCond != that->asCmp()->setCond)
2054          return false;
2055    } else
2056    if (this->asFlow()) {
2057       return false;
2058    } else {
2059       if (this->atomic != that->atomic ||
2060           this->ipa != that->ipa ||
2061           this->lanes != that->lanes ||
2062           this->perPatch != that->perPatch)
2063          return false;
2064       if (this->postFactor != that->postFactor)
2065          return false;
2066    }
2067 
2068    if (this->subOp != that->subOp ||
2069        this->saturate != that->saturate ||
2070        this->rnd != that->rnd ||
2071        this->ftz != that->ftz ||
2072        this->dnz != that->dnz ||
2073        this->cache != that->cache)
2074       return false;
2075 
2076    return true;
2077 }
2078 
2079 bool
2080 Instruction::isResultEqual(const Instruction *that) const
2081 {
2082    unsigned int d, s;
2083 
2084    // NOTE: location of discard only affects tex with liveOnly and quadops
2085    if (!this->defExists(0) && this->op != OP_DISCARD)
2086       return false;
2087 
2088    if (!isActionEqual(that))
2089       return false;
2090 
2091    if (this->predSrc != that->predSrc)
2092       return false;
2093 
2094    for (d = 0; this->defExists(d); ++d) {
2095       if (!that->defExists(d) ||
2096           !this->getDef(d)->equals(that->getDef(d), false))
2097          return false;
2098    }
2099    if (that->defExists(d))
2100       return false;
2101 
2102    for (s = 0; this->srcExists(s); ++s) {
2103       if (!that->srcExists(s))
2104          return false;
2105       if (this->src(s).mod != that->src(s).mod)
2106          return false;
2107       if (!this->getSrc(s)->equals(that->getSrc(s), true))
2108          return false;
2109    }
2110    if (that->srcExists(s))
2111       return false;
2112 
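   // Loads only yield equal results when they read memory that cannot have
   // been written in between (constants, shader inputs).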
2113    if (op == OP_LOAD || op == OP_VFETCH) {
2114       switch (src(0).getFile()) {
2115       case FILE_MEMORY_CONST:
2116       case FILE_SHADER_INPUT:
2117          return true;
2118       default:
2119          return false;
2120       }
2121    }
2122 
2123    return true;
2124 }
2125 
2126 // pull through common expressions from different in-blocks
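// If every source of a phi is referenced only by that phi and all of them
// compute the same result, hoist one of the defining instructions into the
// join block and delete the phi.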
2127 bool
2128 GlobalCSE::visit(BasicBlock *bb)
2129 {
2130    Instruction *phi, *next, *ik;
2131    int s;
2132 
2133    // TODO: maybe do this with OP_UNION, too
2134 
2135    for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
2136       next = phi->next;
2137       if (phi->getSrc(0)->refCount() > 1)
2138          continue;
2139       ik = phi->getSrc(0)->getInsn();
2140       if (!ik)
2141          continue; // probably a function input
2142       for (s = 1; phi->srcExists(s); ++s) {
2143          if (phi->getSrc(s)->refCount() > 1)
2144             break;
2145          if (!phi->getSrc(s)->getInsn() ||
2146              !phi->getSrc(s)->getInsn()->isResultEqual(ik))
2147             break;
2148       }
2149       if (!phi->srcExists(s)) {
2150          Instruction *entry = bb->getEntry();
2151          ik->bb->remove(ik);
2152          if (!entry || entry->op != OP_JOIN)
2153             bb->insertHead(ik);
2154          else
2155             bb->insertAfter(entry, ik);
2156          ik->setDef(0, phi->getDef(0));
2157          delete_Instruction(prog, phi);
2158       }
2159    }
2160 
2161    return true;
2162 }
2163 
2164 bool
2165 LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
2166 {
2167    Instruction *old = *ptr;
2168 
2169    // TODO: maybe relax this later (causes trouble with OP_UNION)
2170    if (i->isPredicated())
2171       return false;
2172 
2173    if (!old->isResultEqual(i))
2174       return false;
2175 
2176    for (int d = 0; old->defExists(d); ++d)
2177       old->def(d).replace(i->getDef(d), false);
2178    delete_Instruction(prog, old);
2179    *ptr = NULL;
2180    return true;
2181 }
2182 
2183 bool
2184 LocalCSE::visit(BasicBlock *bb)
2185 {
2186    unsigned int replaced;
2187 
2188    do {
2189       Instruction *ir, *next;
2190 
2191       replaced = 0;
2192 
2193       // will need to know the order of instructions
2194       int serial = 0;
2195       for (ir = bb->getFirst(); ir; ir = ir->next)
2196          ir->serial = serial++;
2197 
2198       for (ir = bb->getEntry(); ir; ir = next) {
2199          int s;
2200          Value *src = NULL;
2201 
2202          next = ir->next;
2203 
2204          if (ir->fixed) {
2205             ops[ir->op].insert(ir);
2206             continue;
2207          }
2208 
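         // Pick the LValue source with the fewest uses; its use list is the
         // smallest set of candidate instructions to compare against.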
2209          for (s = 0; ir->srcExists(s); ++s)
2210             if (ir->getSrc(s)->asLValue())
2211                if (!src || ir->getSrc(s)->refCount() < src->refCount())
2212                   src = ir->getSrc(s);
2213 
2214          if (src) {
2215             for (Value::UseIterator it = src->uses.begin();
2216                  it != src->uses.end(); ++it) {
2217                Instruction *ik = (*it)->getInsn();
2218                if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
2219                   if (tryReplace(&ir, ik))
2220                      break;
2221             }
2222          } else {
2223             DLLIST_FOR_EACH(&ops[ir->op], iter)
2224             {
2225                Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
2226                if (tryReplace(&ir, ik))
2227                   break;
2228             }
2229          }
2230 
2231          if (ir)
2232             ops[ir->op].insert(ir);
2233          else
2234             ++replaced;
2235       }
2236       for (unsigned int i = 0; i <= OP_LAST; ++i)
2237          ops[i].clear();
2238 
2239    } while (replaced);
2240 
2241    return true;
2242 }
2243 
2244 // =============================================================================
2245 
2246 // Remove computations of unused values.
2247 class DeadCodeElim : public Pass
2248 {
2249 public:
2250    bool buryAll(Program *);
2251 
2252 private:
2253    virtual bool visit(BasicBlock *);
2254 
2255    void checkSplitLoad(Instruction *ld); // for partially dead loads
2256 
2257    unsigned int deadCount;
2258 };
2259 
2260 bool
2261 DeadCodeElim::buryAll(Program *prog)
2262 {
2263    do {
2264       deadCount = 0;
2265       if (!this->run(prog, false, false))
2266          return false;
2267    } while (deadCount);
2268 
2269    return true;
2270 }
2271 
2272 bool
2273 DeadCodeElim::visit(BasicBlock *bb)
2274 {
2275    Instruction *next;
2276 
2277    for (Instruction *i = bb->getFirst(); i; i = next) {
2278       next = i->next;
2279       if (i->isDead()) {
2280          ++deadCount;
2281          delete_Instruction(prog, i);
2282       } else
2283       if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) {
2284          checkSplitLoad(i);
2285       }
2286    }
2287    return true;
2288 }
2289 
2290 void
2291 DeadCodeElim::checkSplitLoad(Instruction *ld1)
2292 {
2293    Instruction *ld2 = NULL; // can get at most 2 loads
2294    Value *def1[4];
2295    Value *def2[4];
2296    int32_t addr1, addr2;
2297    int32_t size1, size2;
2298    int d, n1, n2;
2299    uint32_t mask = 0xffffffff;
2300 
2301    for (d = 0; ld1->defExists(d); ++d)
2302       if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
2303          mask &= ~(1 << d);
2304    if (mask == 0xffffffff)
2305       return;
2306 
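   // Gather the remaining live defs into at most two contiguous groups;
   // leading dead defs only advance the load address.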
2307    addr1 = ld1->getSrc(0)->reg.data.offset;
2308    n1 = n2 = 0;
2309    size1 = size2 = 0;
2310    for (d = 0; ld1->defExists(d); ++d) {
2311       if (mask & (1 << d)) {
2312          if (size1 && (addr1 & 0x7))
2313             break;
2314          def1[n1] = ld1->getDef(d);
2315          size1 += def1[n1++]->reg.size;
2316       } else
2317       if (!n1) {
2318          addr1 += ld1->getDef(d)->reg.size;
2319       } else {
2320          break;
2321       }
2322    }
2323    for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
2324       if (mask & (1 << d)) {
2325          def2[n2] = ld1->getDef(d);
2326          size2 += def2[n2++]->reg.size;
2327       } else {
2328          assert(!n2);
2329          addr2 += ld1->getDef(d)->reg.size;
2330       }
2331    }
2332 
2333    updateLdStOffset(ld1, addr1, func);
2334    ld1->setType(typeOfSize(size1));
2335    for (d = 0; d < 4; ++d)
2336       ld1->setDef(d, (d < n1) ? def1[d] : NULL);
2337 
2338    if (!n2)
2339       return;
2340 
2341    ld2 = cloneShallow(func, ld1);
2342    updateLdStOffset(ld2, addr2, func);
2343    ld2->setType(typeOfSize(size2));
2344    for (d = 0; d < 4; ++d)
2345       ld2->setDef(d, (d < n2) ? def2[d] : NULL);
2346 
2347    ld1->bb->insertAfter(ld1, ld2);
2348 }
2349 
2350 // =============================================================================
2351 
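// Run pass class @n (calling its member @f) only if the optimization level is
// at least @l.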
2352 #define RUN_PASS(l, n, f)                       \
2353    if (level >= (l)) {                          \
2354       if (dbgFlags & NV50_IR_DEBUG_VERBOSE)     \
2355          INFO("PEEPHOLE: %s\n", #n);            \
2356       n pass;                                   \
2357       if (!pass.f(this))                        \
2358          return false;                          \
2359    }
2360 
2361 bool
2362 Program::optimizeSSA(int level)
2363 {
2364    RUN_PASS(1, DeadCodeElim, buryAll);
2365    RUN_PASS(1, CopyPropagation, run);
2366    RUN_PASS(2, GlobalCSE, run);
2367    RUN_PASS(1, LocalCSE, run);
2368    RUN_PASS(2, AlgebraicOpt, run);
2369    RUN_PASS(2, ModifierFolding, run); // before load propagation -> fewer checks
2370    RUN_PASS(1, ConstantFolding, foldAll);
2371    RUN_PASS(1, LoadPropagation, run);
2372    RUN_PASS(2, MemoryOpt, run);
2373    RUN_PASS(2, LocalCSE, run);
2374    RUN_PASS(0, DeadCodeElim, buryAll);
2375 
2376    return true;
2377 }
2378 
2379 bool
2380 Program::optimizePostRA(int level)
2381 {
2382    RUN_PASS(2, FlatteningPass, run);
2383    return true;
2384 }
2385 
2386 }
2387