1 /*
2  * Copyright 2011 Christoph Bumiller
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_target.h"
25 #include "codegen/nv50_ir_build_util.h"
26 
27 extern "C" {
28 #include "util/u_math.h"
29 }
30 
31 namespace nv50_ir {
32 
33 bool
isNop() const34 Instruction::isNop() const
35 {
36    if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
37       return true;
38    if (terminator || join) // XXX: should terminator imply flow ?
39       return false;
40    if (op == OP_ATOM)
41       return false;
42    if (!fixed && op == OP_NOP)
43       return true;
44 
45    if (defExists(0) && def(0).rep()->reg.data.id < 0) {
46       for (int d = 1; defExists(d); ++d)
47          if (def(d).rep()->reg.data.id >= 0)
48             WARN("part of vector result is unused !\n");
49       return true;
50    }
51 
52    if (op == OP_MOV || op == OP_UNION) {
53       if (!getDef(0)->equals(getSrc(0)))
54          return false;
55       if (op == OP_UNION)
56          if (!def(0).rep()->equals(getSrc(1)))
57             return false;
58       return true;
59    }
60 
61    return false;
62 }
63 
isDead() const64 bool Instruction::isDead() const
65 {
66    if (op == OP_STORE ||
67        op == OP_EXPORT ||
68        op == OP_ATOM ||
69        op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB ||
70        op == OP_WRSV)
71       return false;
72 
73    for (int d = 0; defExists(d); ++d)
74       if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
75          return false;
76 
77    if (terminator || asFlow())
78       return false;
79    if (fixed)
80       return false;
81 
82    return true;
83 };
84 
85 // =============================================================================
86 
87 class CopyPropagation : public Pass
88 {
89 private:
90    virtual bool visit(BasicBlock *);
91 };
92 
93 // Propagate all MOVs forward to make subsequent optimization easier, except if
94 // the sources stem from a phi, in which case we don't want to mess up potential
95 // swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
96 bool
visit(BasicBlock * bb)97 CopyPropagation::visit(BasicBlock *bb)
98 {
99    Instruction *mov, *si, *next;
100 
101    for (mov = bb->getEntry(); mov; mov = next) {
102       next = mov->next;
103       if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
104          continue;
105       if (mov->getPredicate())
106          continue;
107       if (mov->def(0).getFile() != mov->src(0).getFile())
108          continue;
109       si = mov->getSrc(0)->getInsn();
110       if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
111          // propagate
112          mov->def(0).replace(mov->getSrc(0), false);
113          delete_Instruction(prog, mov);
114       }
115    }
116    return true;
117 }
118 
119 // =============================================================================
120 
121 class MergeSplits : public Pass
122 {
123 private:
124    virtual bool visit(BasicBlock *);
125 };
126 
127 // For SPLIT / MERGE pairs that operate on the same registers, replace the
128 // post-merge def with the SPLIT's source.
129 bool
visit(BasicBlock * bb)130 MergeSplits::visit(BasicBlock *bb)
131 {
132    Instruction *i, *next, *si;
133 
134    for (i = bb->getEntry(); i; i = next) {
135       next = i->next;
136       if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
137          continue;
138       si = i->getSrc(0)->getInsn();
139       if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
140          continue;
141       i->def(0).replace(si->getSrc(0), false);
142       delete_Instruction(prog, i);
143    }
144 
145    return true;
146 }
147 
148 // =============================================================================
149 
150 class LoadPropagation : public Pass
151 {
152 private:
153    virtual bool visit(BasicBlock *);
154 
155    void checkSwapSrc01(Instruction *);
156 
157    bool isCSpaceLoad(Instruction *);
158    bool isImmdLoad(Instruction *);
159    bool isAttribOrSharedLoad(Instruction *);
160 };
161 
162 bool
isCSpaceLoad(Instruction * ld)163 LoadPropagation::isCSpaceLoad(Instruction *ld)
164 {
165    return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
166 }
167 
168 bool
isImmdLoad(Instruction * ld)169 LoadPropagation::isImmdLoad(Instruction *ld)
170 {
171    if (!ld || (ld->op != OP_MOV) ||
172        ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
173       return false;
174 
175    // A 0 can be replaced with a register, so it doesn't count as an immediate.
176    ImmediateValue val;
177    return ld->src(0).getImmediate(val) && !val.isInteger(0);
178 }
179 
180 bool
isAttribOrSharedLoad(Instruction * ld)181 LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
182 {
183    return ld &&
184       (ld->op == OP_VFETCH ||
185        (ld->op == OP_LOAD &&
186         (ld->src(0).getFile() == FILE_SHADER_INPUT ||
187          ld->src(0).getFile() == FILE_MEMORY_SHARED)));
188 }
189 
190 void
checkSwapSrc01(Instruction * insn)191 LoadPropagation::checkSwapSrc01(Instruction *insn)
192 {
193    const Target *targ = prog->getTarget();
194    if (!targ->getOpInfo(insn).commutative)
195       if (insn->op != OP_SET && insn->op != OP_SLCT && insn->op != OP_SUB)
196          return;
197    if (insn->src(1).getFile() != FILE_GPR)
198       return;
199    // This is the special OP_SET used for alphatesting, we can't reverse its
200    // arguments as that will confuse the fixup code.
201    if (insn->op == OP_SET && insn->subOp)
202       return;
203 
204    Instruction *i0 = insn->getSrc(0)->getInsn();
205    Instruction *i1 = insn->getSrc(1)->getInsn();
206 
207    // Swap sources to inline the less frequently used source. That way,
208    // optimistically, it will eventually be able to remove the instruction.
209    int i0refs = insn->getSrc(0)->refCount();
210    int i1refs = insn->getSrc(1)->refCount();
211 
212    if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
213       if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
214           !targ->insnCanLoad(insn, 1, i1) ||
215           i0refs < i1refs)
216          insn->swapSources(0, 1);
217       else
218          return;
219    } else
220    if (isAttribOrSharedLoad(i1)) {
221       if (!isAttribOrSharedLoad(i0))
222          insn->swapSources(0, 1);
223       else
224          return;
225    } else {
226       return;
227    }
228 
229    if (insn->op == OP_SET || insn->op == OP_SET_AND ||
230        insn->op == OP_SET_OR || insn->op == OP_SET_XOR)
231       insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
232    else
233    if (insn->op == OP_SLCT)
234       insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
235    else
236    if (insn->op == OP_SUB) {
237       insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
238       insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
239    }
240 }
241 
242 bool
visit(BasicBlock * bb)243 LoadPropagation::visit(BasicBlock *bb)
244 {
245    const Target *targ = prog->getTarget();
246    Instruction *next;
247 
248    for (Instruction *i = bb->getEntry(); i; i = next) {
249       next = i->next;
250 
251       if (i->op == OP_CALL) // calls have args as sources, they must be in regs
252          continue;
253 
254       if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
255          continue;
256 
257       if (i->srcExists(1))
258          checkSwapSrc01(i);
259 
260       for (int s = 0; i->srcExists(s); ++s) {
261          Instruction *ld = i->getSrc(s)->getInsn();
262 
263          if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
264             continue;
265          if (!targ->insnCanLoad(i, s, ld))
266             continue;
267 
268          // propagate !
269          i->setSrc(s, ld->getSrc(0));
270          if (ld->src(0).isIndirect(0))
271             i->setIndirect(s, 0, ld->getIndirect(0, 0));
272 
273          if (ld->getDef(0)->refCount() == 0)
274             delete_Instruction(prog, ld);
275       }
276    }
277    return true;
278 }
279 
280 // =============================================================================
281 
282 class IndirectPropagation : public Pass
283 {
284 private:
285    virtual bool visit(BasicBlock *);
286 };
287 
288 bool
visit(BasicBlock * bb)289 IndirectPropagation::visit(BasicBlock *bb)
290 {
291    const Target *targ = prog->getTarget();
292    Instruction *next;
293 
294    for (Instruction *i = bb->getEntry(); i; i = next) {
295       next = i->next;
296 
297       for (int s = 0; i->srcExists(s); ++s) {
298          Instruction *insn;
299          ImmediateValue imm;
300          if (!i->src(s).isIndirect(0))
301             continue;
302          insn = i->getIndirect(s, 0)->getInsn();
303          if (!insn)
304             continue;
305          if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
306             if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
307                 !insn->src(1).getImmediate(imm) ||
308                 !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
309                continue;
310             i->setIndirect(s, 0, insn->getSrc(0));
311             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
312             i->src(s).get()->reg.data.offset += imm.reg.data.u32;
313          } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
314             if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
315                 !insn->src(1).getImmediate(imm) ||
316                 !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
317                continue;
318             i->setIndirect(s, 0, insn->getSrc(0));
319             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
320             i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
321          } else if (insn->op == OP_MOV) {
322             if (!insn->src(0).getImmediate(imm) ||
323                 !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
324                continue;
325             i->setIndirect(s, 0, NULL);
326             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
327             i->src(s).get()->reg.data.offset += imm.reg.data.u32;
328          }
329       }
330    }
331    return true;
332 }
333 
334 // =============================================================================
335 
336 // Evaluate constant expressions.
337 class ConstantFolding : public Pass
338 {
339 public:
340    bool foldAll(Program *);
341 
342 private:
343    virtual bool visit(BasicBlock *);
344 
345    void expr(Instruction *, ImmediateValue&, ImmediateValue&);
346    void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
347    void opnd(Instruction *, ImmediateValue&, int s);
348    void opnd3(Instruction *, ImmediateValue&);
349 
350    void unary(Instruction *, const ImmediateValue&);
351 
352    void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
353 
354    CmpInstruction *findOriginForTestWithZero(Value *);
355 
356    unsigned int foldCount;
357 
358    BuildUtil bld;
359 };
360 
361 // TODO: remember generated immediates and only revisit these
362 bool
foldAll(Program * prog)363 ConstantFolding::foldAll(Program *prog)
364 {
365    unsigned int iterCount = 0;
366    do {
367       foldCount = 0;
368       if (!run(prog))
369          return false;
370    } while (foldCount && ++iterCount < 2);
371    return true;
372 }
373 
374 bool
visit(BasicBlock * bb)375 ConstantFolding::visit(BasicBlock *bb)
376 {
377    Instruction *i, *next;
378 
379    for (i = bb->getEntry(); i; i = next) {
380       next = i->next;
381       if (i->op == OP_MOV || i->op == OP_CALL)
382          continue;
383 
384       ImmediateValue src0, src1, src2;
385 
386       if (i->srcExists(2) &&
387           i->src(0).getImmediate(src0) &&
388           i->src(1).getImmediate(src1) &&
389           i->src(2).getImmediate(src2))
390          expr(i, src0, src1, src2);
391       else
392       if (i->srcExists(1) &&
393           i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
394          expr(i, src0, src1);
395       else
396       if (i->srcExists(0) && i->src(0).getImmediate(src0))
397          opnd(i, src0, 0);
398       else
399       if (i->srcExists(1) && i->src(1).getImmediate(src1))
400          opnd(i, src1, 1);
401       if (i->srcExists(2) && i->src(2).getImmediate(src2))
402          opnd3(i, src2);
403    }
404    return true;
405 }
406 
407 CmpInstruction *
findOriginForTestWithZero(Value * value)408 ConstantFolding::findOriginForTestWithZero(Value *value)
409 {
410    if (!value)
411       return NULL;
412    Instruction *insn = value->getInsn();
413    if (!insn)
414       return NULL;
415 
416    if (insn->asCmp() && insn->op != OP_SLCT)
417       return insn->asCmp();
418 
419    /* Sometimes mov's will sneak in as a result of other folding. This gets
420     * cleaned up later.
421     */
422    if (insn->op == OP_MOV)
423       return findOriginForTestWithZero(insn->getSrc(0));
424 
425    /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
426    if (insn->op == OP_AND) {
427       int s = 0;
428       ImmediateValue imm;
429       if (!insn->src(s).getImmediate(imm)) {
430          s = 1;
431          if (!insn->src(s).getImmediate(imm))
432             return NULL;
433       }
434       if (imm.reg.data.f32 != 1.0f)
435          return NULL;
436       /* TODO: Come up with a way to handle the condition being inverted */
437       if (insn->src(!s).mod != Modifier(0))
438          return NULL;
439       return findOriginForTestWithZero(insn->getSrc(!s));
440    }
441 
442    return NULL;
443 }
444 
445 void
applyTo(ImmediateValue & imm) const446 Modifier::applyTo(ImmediateValue& imm) const
447 {
448    if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
449       return;
450    switch (imm.reg.type) {
451    case TYPE_F32:
452       if (bits & NV50_IR_MOD_ABS)
453          imm.reg.data.f32 = fabsf(imm.reg.data.f32);
454       if (bits & NV50_IR_MOD_NEG)
455          imm.reg.data.f32 = -imm.reg.data.f32;
456       if (bits & NV50_IR_MOD_SAT) {
457          if (imm.reg.data.f32 < 0.0f)
458             imm.reg.data.f32 = 0.0f;
459          else
460          if (imm.reg.data.f32 > 1.0f)
461             imm.reg.data.f32 = 1.0f;
462       }
463       assert(!(bits & NV50_IR_MOD_NOT));
464       break;
465 
466    case TYPE_S8: // NOTE: will be extended
467    case TYPE_S16:
468    case TYPE_S32:
469    case TYPE_U8: // NOTE: treated as signed
470    case TYPE_U16:
471    case TYPE_U32:
472       if (bits & NV50_IR_MOD_ABS)
473          imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
474             imm.reg.data.s32 : -imm.reg.data.s32;
475       if (bits & NV50_IR_MOD_NEG)
476          imm.reg.data.s32 = -imm.reg.data.s32;
477       if (bits & NV50_IR_MOD_NOT)
478          imm.reg.data.s32 = ~imm.reg.data.s32;
479       break;
480 
481    case TYPE_F64:
482       if (bits & NV50_IR_MOD_ABS)
483          imm.reg.data.f64 = fabs(imm.reg.data.f64);
484       if (bits & NV50_IR_MOD_NEG)
485          imm.reg.data.f64 = -imm.reg.data.f64;
486       if (bits & NV50_IR_MOD_SAT) {
487          if (imm.reg.data.f64 < 0.0)
488             imm.reg.data.f64 = 0.0;
489          else
490          if (imm.reg.data.f64 > 1.0)
491             imm.reg.data.f64 = 1.0;
492       }
493       assert(!(bits & NV50_IR_MOD_NOT));
494       break;
495 
496    default:
497       assert(!"invalid/unhandled type");
498       imm.reg.data.u64 = 0;
499       break;
500    }
501 }
502 
503 operation
getOp() const504 Modifier::getOp() const
505 {
506    switch (bits) {
507    case NV50_IR_MOD_ABS: return OP_ABS;
508    case NV50_IR_MOD_NEG: return OP_NEG;
509    case NV50_IR_MOD_SAT: return OP_SAT;
510    case NV50_IR_MOD_NOT: return OP_NOT;
511    case 0:
512       return OP_MOV;
513    default:
514       return OP_CVT;
515    }
516 }
517 
518 void
expr(Instruction * i,ImmediateValue & imm0,ImmediateValue & imm1)519 ConstantFolding::expr(Instruction *i,
520                       ImmediateValue &imm0, ImmediateValue &imm1)
521 {
522    struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
523    struct Storage res;
524    DataType type = i->dType;
525 
526    memset(&res.data, 0, sizeof(res.data));
527 
528    switch (i->op) {
529    case OP_MAD:
530    case OP_FMA:
531    case OP_MUL:
532       if (i->dnz && i->dType == TYPE_F32) {
533          if (!isfinite(a->data.f32))
534             a->data.f32 = 0.0f;
535          if (!isfinite(b->data.f32))
536             b->data.f32 = 0.0f;
537       }
538       switch (i->dType) {
539       case TYPE_F32:
540          res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor);
541          break;
542       case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
543       case TYPE_S32:
544          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
545             res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
546             break;
547          }
548          /* fallthrough */
549       case TYPE_U32:
550          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
551             res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
552             break;
553          }
554          res.data.u32 = a->data.u32 * b->data.u32; break;
555       default:
556          return;
557       }
558       break;
559    case OP_DIV:
560       if (b->data.u32 == 0)
561          break;
562       switch (i->dType) {
563       case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
564       case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
565       case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
566       case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
567       default:
568          return;
569       }
570       break;
571    case OP_ADD:
572       switch (i->dType) {
573       case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
574       case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
575       case TYPE_S32:
576       case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
577       default:
578          return;
579       }
580       break;
581    case OP_SUB:
582       switch (i->dType) {
583       case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
584       case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
585       case TYPE_S32:
586       case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
587       default:
588          return;
589       }
590       break;
591    case OP_POW:
592       switch (i->dType) {
593       case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
594       case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
595       default:
596          return;
597       }
598       break;
599    case OP_MAX:
600       switch (i->dType) {
601       case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
602       case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
603       case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
604       case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
605       default:
606          return;
607       }
608       break;
609    case OP_MIN:
610       switch (i->dType) {
611       case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
612       case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
613       case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
614       case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
615       default:
616          return;
617       }
618       break;
619    case OP_AND:
620       res.data.u64 = a->data.u64 & b->data.u64;
621       break;
622    case OP_OR:
623       res.data.u64 = a->data.u64 | b->data.u64;
624       break;
625    case OP_XOR:
626       res.data.u64 = a->data.u64 ^ b->data.u64;
627       break;
628    case OP_SHL:
629       res.data.u32 = a->data.u32 << b->data.u32;
630       break;
631    case OP_SHR:
632       switch (i->dType) {
633       case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
634       case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
635       default:
636          return;
637       }
638       break;
639    case OP_SLCT:
640       if (a->data.u32 != b->data.u32)
641          return;
642       res.data.u32 = a->data.u32;
643       break;
644    case OP_EXTBF: {
645       int offset = b->data.u32 & 0xff;
646       int width = (b->data.u32 >> 8) & 0xff;
647       int rshift = offset;
648       int lshift = 0;
649       if (width == 0) {
650          res.data.u32 = 0;
651          break;
652       }
653       if (width + offset < 32) {
654          rshift = 32 - width;
655          lshift = 32 - width - offset;
656       }
657       if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
658          res.data.u32 = util_bitreverse(a->data.u32);
659       else
660          res.data.u32 = a->data.u32;
661       switch (i->dType) {
662       case TYPE_S32: res.data.s32 = (res.data.s32 << lshift) >> rshift; break;
663       case TYPE_U32: res.data.u32 = (res.data.u32 << lshift) >> rshift; break;
664       default:
665          return;
666       }
667       break;
668    }
669    case OP_POPCNT:
670       res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
671       break;
672    case OP_PFETCH:
673       // The two arguments to pfetch are logically added together. Normally
674       // the second argument will not be constant, but that can happen.
675       res.data.u32 = a->data.u32 + b->data.u32;
676       type = TYPE_U32;
677       break;
678    case OP_MERGE:
679       switch (i->dType) {
680       case TYPE_U64:
681       case TYPE_S64:
682       case TYPE_F64:
683          res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
684          break;
685       default:
686          return;
687       }
688       break;
689    default:
690       return;
691    }
692    ++foldCount;
693 
694    i->src(0).mod = Modifier(0);
695    i->src(1).mod = Modifier(0);
696    i->postFactor = 0;
697 
698    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
699    i->setSrc(1, NULL);
700 
701    i->getSrc(0)->reg.data = res.data;
702    i->getSrc(0)->reg.type = type;
703    i->getSrc(0)->reg.size = typeSizeof(type);
704 
705    switch (i->op) {
706    case OP_MAD:
707    case OP_FMA: {
708       ImmediateValue src0, src1 = *i->getSrc(0)->asImm();
709 
710       // Move the immediate into position 1, where we know it might be
711       // emittable. However it might not be anyways, as there may be other
712       // restrictions, so move it into a separate LValue.
713       bld.setPosition(i, false);
714       i->op = OP_ADD;
715       i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
716       i->setSrc(0, i->getSrc(2));
717       i->src(0).mod = i->src(2).mod;
718       i->setSrc(2, NULL);
719 
720       if (i->src(0).getImmediate(src0))
721          expr(i, src0, src1);
722       else
723          opnd(i, src1, 1);
724       break;
725    }
726    case OP_PFETCH:
727       // Leave PFETCH alone... we just folded its 2 args into 1.
728       break;
729    default:
730       i->op = i->saturate ? OP_SAT : OP_MOV;
731       if (i->saturate)
732          unary(i, *i->getSrc(0)->asImm());
733       break;
734    }
735    i->subOp = 0;
736 }
737 
738 void
expr(Instruction * i,ImmediateValue & imm0,ImmediateValue & imm1,ImmediateValue & imm2)739 ConstantFolding::expr(Instruction *i,
740                       ImmediateValue &imm0,
741                       ImmediateValue &imm1,
742                       ImmediateValue &imm2)
743 {
744    struct Storage *const a = &imm0.reg, *const b = &imm1.reg, *const c = &imm2.reg;
745    struct Storage res;
746 
747    memset(&res.data, 0, sizeof(res.data));
748 
749    switch (i->op) {
750    case OP_INSBF: {
751       int offset = b->data.u32 & 0xff;
752       int width = (b->data.u32 >> 8) & 0xff;
753       unsigned bitmask = ((1 << width) - 1) << offset;
754       res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
755       break;
756    }
757    case OP_MAD:
758    case OP_FMA: {
759       switch (i->dType) {
760       case TYPE_F32:
761          res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
762             c->data.f32;
763          break;
764       case TYPE_F64:
765          res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
766          break;
767       case TYPE_S32:
768          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
769             res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
770             break;
771          }
772          /* fallthrough */
773       case TYPE_U32:
774          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
775             res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
776             break;
777          }
778          res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
779          break;
780       default:
781          return;
782       }
783       break;
784    }
785    case OP_SHLADD:
786       res.data.u32 = (a->data.u32 << b->data.u32) + c->data.u32;
787       break;
788    default:
789       return;
790    }
791 
792    ++foldCount;
793    i->src(0).mod = Modifier(0);
794    i->src(1).mod = Modifier(0);
795    i->src(2).mod = Modifier(0);
796 
797    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
798    i->setSrc(1, NULL);
799    i->setSrc(2, NULL);
800 
801    i->getSrc(0)->reg.data = res.data;
802    i->getSrc(0)->reg.type = i->dType;
803    i->getSrc(0)->reg.size = typeSizeof(i->dType);
804 
805    i->op = OP_MOV;
806 }
807 
808 void
unary(Instruction * i,const ImmediateValue & imm)809 ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
810 {
811    Storage res;
812 
813    if (i->dType != TYPE_F32)
814       return;
815    switch (i->op) {
816    case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
817    case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
818    case OP_SAT: res.data.f32 = CLAMP(imm.reg.data.f32, 0.0f, 1.0f); break;
819    case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
820    case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
821    case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
822    case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
823    case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
824    case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
825    case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
826    case OP_PRESIN:
827    case OP_PREEX2:
828       // these should be handled in subsequent OP_SIN/COS/EX2
829       res.data.f32 = imm.reg.data.f32;
830       break;
831    default:
832       return;
833    }
834    i->op = OP_MOV;
835    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
836    i->src(0).mod = Modifier(0);
837 }
838 
839 void
tryCollapseChainedMULs(Instruction * mul2,const int s,ImmediateValue & imm2)840 ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
841                                         const int s, ImmediateValue& imm2)
842 {
843    const int t = s ? 0 : 1;
844    Instruction *insn;
845    Instruction *mul1 = NULL; // mul1 before mul2
846    int e = 0;
847    float f = imm2.reg.data.f32 * exp2f(mul2->postFactor);
848    ImmediateValue imm1;
849 
850    assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);
851 
852    if (mul2->getSrc(t)->refCount() == 1) {
853       insn = mul2->getSrc(t)->getInsn();
854       if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
855          mul1 = insn;
856       if (mul1 && !mul1->saturate) {
857          int s1;
858 
859          if (mul1->src(s1 = 0).getImmediate(imm1) ||
860              mul1->src(s1 = 1).getImmediate(imm1)) {
861             bld.setPosition(mul1, false);
862             // a = mul r, imm1
863             // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
864             mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
865             mul1->src(s1).mod = Modifier(0);
866             mul2->def(0).replace(mul1->getDef(0), false);
867             mul1->saturate = mul2->saturate;
868          } else
869          if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
870             // c = mul a, b
871             // d = mul c, imm   -> d = mul_x_imm a, b
872             mul1->postFactor = e;
873             mul2->def(0).replace(mul1->getDef(0), false);
874             if (f < 0)
875                mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
876             mul1->saturate = mul2->saturate;
877          }
878          return;
879       }
880    }
881    if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
882       // b = mul a, imm
883       // d = mul b, c   -> d = mul_x_imm a, c
884       int s2, t2;
885       insn = (*mul2->getDef(0)->uses.begin())->getInsn();
886       if (!insn)
887          return;
888       mul1 = mul2;
889       mul2 = NULL;
890       s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
891       t2 = s2 ? 0 : 1;
892       if (insn->op == OP_MUL && insn->dType == TYPE_F32)
893          if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
894             mul2 = insn;
895       if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
896          mul2->postFactor = e;
897          mul2->setSrc(s2, mul1->src(t));
898          if (f < 0)
899             mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
900       }
901    }
902 }
903 
904 void
opnd3(Instruction * i,ImmediateValue & imm2)905 ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
906 {
907    switch (i->op) {
908    case OP_MAD:
909    case OP_FMA:
910       if (imm2.isInteger(0)) {
911          i->op = OP_MUL;
912          i->setSrc(2, NULL);
913          foldCount++;
914          return;
915       }
916       break;
917    case OP_SHLADD:
918       if (imm2.isInteger(0)) {
919          i->op = OP_SHL;
920          i->setSrc(2, NULL);
921          foldCount++;
922          return;
923       }
924       break;
925    default:
926       return;
927    }
928 }
929 
930 void
opnd(Instruction * i,ImmediateValue & imm0,int s)931 ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
932 {
933    const Target *target = prog->getTarget();
934    const int t = !s;
935    const operation op = i->op;
936    Instruction *newi = i;
937 
938    switch (i->op) {
939    case OP_SPLIT: {
940       bld.setPosition(i, false);
941 
942       uint8_t size = i->getDef(0)->reg.size;
943       uint8_t bitsize = size * 8;
944       uint32_t mask = (1ULL << bitsize) - 1;
945       assert(bitsize <= 32);
946 
947       uint64_t val = imm0.reg.data.u64;
948       for (int8_t d = 0; i->defExists(d); ++d) {
949          Value *def = i->getDef(d);
950          assert(def->reg.size == size);
951 
952          newi = bld.mkMov(def, bld.mkImm((uint32_t)(val & mask)), TYPE_U32);
953          val >>= bitsize;
954       }
955       delete_Instruction(prog, i);
956       break;
957    }
958    case OP_MUL:
959       if (i->dType == TYPE_F32)
960          tryCollapseChainedMULs(i, s, imm0);
961 
962       if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
963          assert(!isFloatType(i->sType));
964          if (imm0.isInteger(1) && i->dType == TYPE_S32) {
965             bld.setPosition(i, false);
966             // Need to set to the sign value, which is a compare.
967             newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
968                              TYPE_S32, i->getSrc(t), bld.mkImm(0));
969             delete_Instruction(prog, i);
970          } else if (imm0.isInteger(0) || imm0.isInteger(1)) {
971             // The high bits can't be set in this case (either mul by 0 or
972             // unsigned by 1)
973             i->op = OP_MOV;
974             i->subOp = 0;
975             i->setSrc(0, new_ImmediateValue(prog, 0u));
976             i->src(0).mod = Modifier(0);
977             i->setSrc(1, NULL);
978          } else if (!imm0.isNegative() && imm0.isPow2()) {
979             // Translate into a shift
980             imm0.applyLog2();
981             i->op = OP_SHR;
982             i->subOp = 0;
983             imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
984             i->setSrc(0, i->getSrc(t));
985             i->src(0).mod = i->src(t).mod;
986             i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
987             i->src(1).mod = 0;
988          }
989       } else
990       if (imm0.isInteger(0)) {
991          i->op = OP_MOV;
992          i->setSrc(0, new_ImmediateValue(prog, 0u));
993          i->src(0).mod = Modifier(0);
994          i->postFactor = 0;
995          i->setSrc(1, NULL);
996       } else
997       if (!i->postFactor && (imm0.isInteger(1) || imm0.isInteger(-1))) {
998          if (imm0.isNegative())
999             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
1000          i->op = i->src(t).mod.getOp();
1001          if (s == 0) {
1002             i->setSrc(0, i->getSrc(1));
1003             i->src(0).mod = i->src(1).mod;
1004             i->src(1).mod = 0;
1005          }
1006          if (i->op != OP_CVT)
1007             i->src(0).mod = 0;
1008          i->setSrc(1, NULL);
1009       } else
1010       if (!i->postFactor && (imm0.isInteger(2) || imm0.isInteger(-2))) {
1011          if (imm0.isNegative())
1012             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
1013          i->op = OP_ADD;
1014          i->setSrc(s, i->getSrc(t));
1015          i->src(s).mod = i->src(t).mod;
1016       } else
1017       if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
1018          i->op = OP_SHL;
1019          imm0.applyLog2();
1020          i->setSrc(0, i->getSrc(t));
1021          i->src(0).mod = i->src(t).mod;
1022          i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
1023          i->src(1).mod = 0;
1024       } else
1025       if (i->postFactor && i->sType == TYPE_F32) {
1026          /* Can't emit a postfactor with an immediate, have to fold it in */
1027          i->setSrc(s, new_ImmediateValue(
1028                       prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
1029          i->postFactor = 0;
1030       }
1031       break;
1032    case OP_FMA:
1033    case OP_MAD:
1034       if (imm0.isInteger(0)) {
1035          i->setSrc(0, i->getSrc(2));
1036          i->src(0).mod = i->src(2).mod;
1037          i->setSrc(1, NULL);
1038          i->setSrc(2, NULL);
1039          i->op = i->src(0).mod.getOp();
1040          if (i->op != OP_CVT)
1041             i->src(0).mod = 0;
1042       } else
1043       if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
1044           (imm0.isInteger(1) || imm0.isInteger(-1))) {
1045          if (imm0.isNegative())
1046             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
1047          if (s == 0) {
1048             i->setSrc(0, i->getSrc(1));
1049             i->src(0).mod = i->src(1).mod;
1050          }
1051          i->setSrc(1, i->getSrc(2));
1052          i->src(1).mod = i->src(2).mod;
1053          i->setSrc(2, NULL);
1054          i->op = OP_ADD;
1055       } else
1056       if (s == 1 && !imm0.isNegative() && imm0.isPow2() &&
1057           !isFloatType(i->dType) &&
1058           target->isOpSupported(OP_SHLADD, i->dType)) {
1059          i->op = OP_SHLADD;
1060          imm0.applyLog2();
1061          i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
1062       }
1063       break;
1064    case OP_SUB:
1065       if (imm0.isInteger(0) && s == 0 && typeSizeof(i->dType) == 8 &&
1066           !isFloatType(i->dType))
1067          break;
1068       /* fallthrough */
1069    case OP_ADD:
1070       if (i->usesFlags())
1071          break;
1072       if (imm0.isInteger(0)) {
1073          if (s == 0) {
1074             i->setSrc(0, i->getSrc(1));
1075             i->src(0).mod = i->src(1).mod;
1076             if (i->op == OP_SUB)
1077                i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
1078          }
1079          i->setSrc(1, NULL);
1080          i->op = i->src(0).mod.getOp();
1081          if (i->op != OP_CVT)
1082             i->src(0).mod = Modifier(0);
1083       }
1084       break;
1085 
1086    case OP_DIV:
1087       if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
1088          break;
1089       bld.setPosition(i, false);
1090       if (imm0.reg.data.u32 == 0) {
1091          break;
1092       } else
1093       if (imm0.reg.data.u32 == 1) {
1094          i->op = OP_MOV;
1095          i->setSrc(1, NULL);
1096       } else
1097       if (i->dType == TYPE_U32 && imm0.isPow2()) {
1098          i->op = OP_SHR;
1099          i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
1100       } else
1101       if (i->dType == TYPE_U32) {
1102          Instruction *mul;
1103          Value *tA, *tB;
1104          const uint32_t d = imm0.reg.data.u32;
1105          uint32_t m;
1106          int r, s;
1107          uint32_t l = util_logbase2(d);
1108          if (((uint32_t)1 << l) < d)
1109             ++l;
1110          m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
1111          r = l ? 1 : 0;
1112          s = l ? (l - 1) : 0;
1113 
1114          tA = bld.getSSA();
1115          tB = bld.getSSA();
1116          mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
1117                          bld.loadImm(NULL, m));
1118          mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
1119          bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
1120          tA = bld.getSSA();
1121          if (r)
1122             bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
1123          else
1124             tA = tB;
1125          tB = s ? bld.getSSA() : i->getDef(0);
1126          newi = bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
1127          if (s)
1128             bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
1129 
1130          delete_Instruction(prog, i);
1131       } else
1132       if (imm0.reg.data.s32 == -1) {
1133          i->op = OP_NEG;
1134          i->setSrc(1, NULL);
1135       } else {
1136          LValue *tA, *tB;
1137          LValue *tD;
1138          const int32_t d = imm0.reg.data.s32;
1139          int32_t m;
1140          int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
1141          if ((1 << l) < abs(d))
1142             ++l;
1143          if (!l)
1144             l = 1;
1145          m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);
1146 
1147          tA = bld.getSSA();
1148          tB = bld.getSSA();
1149          bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
1150                    i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
1151          if (l > 1)
1152             bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
1153          else
1154             tB = tA;
1155          tA = bld.getSSA();
1156          bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, TYPE_S32, i->getSrc(0), bld.mkImm(0));
1157          tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
1158          newi = bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
1159          if (d < 0)
1160             bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB);
1161 
1162          delete_Instruction(prog, i);
1163       }
1164       break;
1165 
1166    case OP_MOD:
1167       if (s == 1 && imm0.isPow2()) {
1168          bld.setPosition(i, false);
1169          if (i->sType == TYPE_U32) {
1170             i->op = OP_AND;
1171             i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
1172          } else if (i->sType == TYPE_S32) {
1173             // Do it on the absolute value of the input, and then restore the
1174             // sign. The only odd case is MIN_INT, but that should work out
1175             // as well, since MIN_INT mod any power of 2 is 0.
1176             //
1177             // Technically we don't have to do any of this since MOD is
1178             // undefined with negative arguments in GLSL, but this seems like
1179             // the nice thing to do.
1180             Value *abs = bld.mkOp1v(OP_ABS, TYPE_S32, bld.getSSA(), i->getSrc(0));
1181             Value *neg, *v1, *v2;
1182             bld.mkCmp(OP_SET, CC_LT, TYPE_S32,
1183                       (neg = bld.getSSA(1, prog->getTarget()->nativeFile(FILE_PREDICATE))),
1184                       TYPE_S32, i->getSrc(0), bld.loadImm(NULL, 0));
1185             Value *mod = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), abs,
1186                                     bld.loadImm(NULL, imm0.reg.data.u32 - 1));
1187             bld.mkOp1(OP_NEG, TYPE_S32, (v1 = bld.getSSA()), mod)
1188                ->setPredicate(CC_P, neg);
1189             bld.mkOp1(OP_MOV, TYPE_S32, (v2 = bld.getSSA()), mod)
1190                ->setPredicate(CC_NOT_P, neg);
1191             newi = bld.mkOp2(OP_UNION, TYPE_S32, i->getDef(0), v1, v2);
1192 
1193             delete_Instruction(prog, i);
1194          }
1195       } else if (s == 1) {
1196          // In this case, we still want the optimized lowering that we get
1197          // from having division by an immediate.
1198          //
1199          // a % b == a - (a/b) * b
1200          bld.setPosition(i, false);
1201          Value *div = bld.mkOp2v(OP_DIV, i->sType, bld.getSSA(),
1202                                  i->getSrc(0), i->getSrc(1));
1203          newi = bld.mkOp2(OP_ADD, i->sType, i->getDef(0), i->getSrc(0),
1204                           bld.mkOp2v(OP_MUL, i->sType, bld.getSSA(), div, i->getSrc(1)));
1205          // TODO: Check that target supports this. In this case, we know that
1206          // all backends do.
1207          newi->src(1).mod = Modifier(NV50_IR_MOD_NEG);
1208 
1209          delete_Instruction(prog, i);
1210       }
1211       break;
1212 
1213    case OP_SET: // TODO: SET_AND,OR,XOR
1214    {
1215       /* This optimizes the case where the output of a set is being compared
1216        * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
1217        * can be a lot cleverer in our comparison.
1218        */
1219       CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
1220       CondCode cc, ccZ;
1221       if (imm0.reg.data.u32 != 0 || !si)
1222          return;
1223       cc = si->setCond;
1224       ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
1225       // We do everything assuming var (cmp) 0, reverse the condition if 0 is
1226       // first.
1227       if (s == 0)
1228          ccZ = reverseCondCode(ccZ);
1229       // If there is a negative modifier, we need to undo that, by flipping
1230       // the comparison to zero.
1231       if (i->src(t).mod.neg())
1232          ccZ = reverseCondCode(ccZ);
1233       // If this is a signed comparison, we expect the input to be a regular
1234       // boolean, i.e. 0/-1. However the rest of the logic assumes that true
1235       // is positive, so just flip the sign.
1236       if (i->sType == TYPE_S32) {
1237          assert(!isFloatType(si->dType));
1238          ccZ = reverseCondCode(ccZ);
1239       }
1240       switch (ccZ) {
1241       case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
1242       case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
1243       case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
1244       case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
1245       case CC_GT: break; // bool > 0 -- bool
1246       case CC_NE: break; // bool != 0 -- bool
1247       default:
1248          return;
1249       }
1250 
1251       // Update the condition of this SET to be identical to the origin set,
1252       // but with the updated condition code. The original SET should get
1253       // DCE'd, ideally.
1254       i->op = si->op;
1255       i->asCmp()->setCond = cc;
1256       i->setSrc(0, si->src(0));
1257       i->setSrc(1, si->src(1));
1258       if (si->srcExists(2))
1259          i->setSrc(2, si->src(2));
1260       i->sType = si->sType;
1261    }
1262       break;
1263 
1264    case OP_AND:
1265    {
1266       Instruction *src = i->getSrc(t)->getInsn();
1267       ImmediateValue imm1;
1268       if (imm0.reg.data.u32 == 0) {
1269          i->op = OP_MOV;
1270          i->setSrc(0, new_ImmediateValue(prog, 0u));
1271          i->src(0).mod = Modifier(0);
1272          i->setSrc(1, NULL);
1273       } else if (imm0.reg.data.u32 == ~0U) {
1274          i->op = i->src(t).mod.getOp();
1275          if (t) {
1276             i->setSrc(0, i->getSrc(t));
1277             i->src(0).mod = i->src(t).mod;
1278          }
1279          i->setSrc(1, NULL);
1280       } else if (src->asCmp()) {
1281          CmpInstruction *cmp = src->asCmp();
1282          if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
1283             return;
1284          if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
1285             return;
1286          if (imm0.reg.data.f32 != 1.0)
1287             return;
1288          if (cmp->dType != TYPE_U32)
1289             return;
1290 
1291          cmp->dType = TYPE_F32;
1292          if (i->src(t).mod != Modifier(0)) {
1293             assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
1294             i->src(t).mod = Modifier(0);
1295             cmp->setCond = inverseCondCode(cmp->setCond);
1296          }
1297          i->op = OP_MOV;
1298          i->setSrc(s, NULL);
1299          if (t) {
1300             i->setSrc(0, i->getSrc(t));
1301             i->setSrc(t, NULL);
1302          }
1303       } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
1304                  src->op == OP_SHR &&
1305                  src->src(1).getImmediate(imm1) &&
1306                  i->src(t).mod == Modifier(0) &&
1307                  util_is_power_of_two(imm0.reg.data.u32 + 1)) {
1308          // low byte = offset, high byte = width
1309          uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
1310          i->op = OP_EXTBF;
1311          i->setSrc(0, src->getSrc(0));
1312          i->setSrc(1, new_ImmediateValue(prog, ext));
1313       } else if (src->op == OP_SHL &&
1314                  src->src(1).getImmediate(imm1) &&
1315                  i->src(t).mod == Modifier(0) &&
1316                  util_is_power_of_two(~imm0.reg.data.u32 + 1) &&
1317                  util_last_bit(~imm0.reg.data.u32) <= imm1.reg.data.u32) {
1318          i->op = OP_MOV;
1319          i->setSrc(s, NULL);
1320          if (t) {
1321             i->setSrc(0, i->getSrc(t));
1322             i->setSrc(t, NULL);
1323          }
1324       }
1325    }
1326       break;
1327 
1328    case OP_SHL:
1329    {
1330       if (s != 1 || i->src(0).mod != Modifier(0))
1331          break;
1332       // try to concatenate shifts
1333       Instruction *si = i->getSrc(0)->getInsn();
1334       if (!si)
1335          break;
1336       ImmediateValue imm1;
1337       switch (si->op) {
1338       case OP_SHL:
1339          if (si->src(1).getImmediate(imm1)) {
1340             bld.setPosition(i, false);
1341             i->setSrc(0, si->getSrc(0));
1342             i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
1343          }
1344          break;
1345       case OP_SHR:
1346          if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
1347             bld.setPosition(i, false);
1348             i->op = OP_AND;
1349             i->setSrc(0, si->getSrc(0));
1350             i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
1351          }
1352          break;
1353       case OP_MUL:
1354          int muls;
1355          if (isFloatType(si->dType))
1356             return;
1357          if (si->src(1).getImmediate(imm1))
1358             muls = 1;
1359          else if (si->src(0).getImmediate(imm1))
1360             muls = 0;
1361          else
1362             return;
1363 
1364          bld.setPosition(i, false);
1365          i->op = OP_MUL;
1366          i->setSrc(0, si->getSrc(!muls));
1367          i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
1368          break;
1369       case OP_SUB:
1370       case OP_ADD:
1371          int adds;
1372          if (isFloatType(si->dType))
1373             return;
1374          if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
1375             adds = 0;
1376          else if (si->src(1).getImmediate(imm1))
1377             adds = 1;
1378          else
1379             return;
1380          if (si->src(!adds).mod != Modifier(0))
1381             return;
1382          // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
1383 
1384          // This is more operations, but if one of x, y is an immediate, then
1385          // we can get a situation where (a) we can use ISCADD, or (b)
1386          // propagate the add bit into an indirect load.
1387          bld.setPosition(i, false);
1388          i->op = si->op;
1389          i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
1390          i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
1391                                      bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
1392                                      si->getSrc(!adds),
1393                                      bld.mkImm(imm0.reg.data.u32)));
1394          break;
1395       default:
1396          return;
1397       }
1398    }
1399       break;
1400 
1401    case OP_ABS:
1402    case OP_NEG:
1403    case OP_SAT:
1404    case OP_LG2:
1405    case OP_RCP:
1406    case OP_SQRT:
1407    case OP_RSQ:
1408    case OP_PRESIN:
1409    case OP_SIN:
1410    case OP_COS:
1411    case OP_PREEX2:
1412    case OP_EX2:
1413       unary(i, imm0);
1414       break;
1415    case OP_BFIND: {
1416       int32_t res;
1417       switch (i->dType) {
1418       case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
1419       case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
1420       default:
1421          return;
1422       }
1423       if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
1424          res = 31 - res;
1425       bld.setPosition(i, false); /* make sure bld is init'ed */
1426       i->setSrc(0, bld.mkImm(res));
1427       i->setSrc(1, NULL);
1428       i->op = OP_MOV;
1429       i->subOp = 0;
1430       break;
1431    }
1432    case OP_POPCNT: {
1433       // Only deal with 1-arg POPCNT here
1434       if (i->srcExists(1))
1435          break;
1436       uint32_t res = util_bitcount(imm0.reg.data.u32);
1437       i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
1438       i->setSrc(1, NULL);
1439       i->op = OP_MOV;
1440       break;
1441    }
1442    case OP_CVT: {
1443       Storage res;
1444 
1445       // TODO: handle 64-bit values properly
1446       if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
1447          return;
1448 
1449       // TODO: handle single byte/word extractions
1450       if (i->subOp)
1451          return;
1452 
1453       bld.setPosition(i, true); /* make sure bld is init'ed */
1454 
1455 #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
1456    case type: \
1457       switch (i->sType) { \
1458       case TYPE_F64: \
1459          res.data.dst = util_iround(i->saturate ? \
1460                                     CLAMP(imm0.reg.data.f64, fmin, fmax) : \
1461                                     imm0.reg.data.f64); \
1462          break; \
1463       case TYPE_F32: \
1464          res.data.dst = util_iround(i->saturate ? \
1465                                     CLAMP(imm0.reg.data.f32, fmin, fmax) : \
1466                                     imm0.reg.data.f32); \
1467          break; \
1468       case TYPE_S32: \
1469          res.data.dst = i->saturate ? \
1470                         CLAMP(imm0.reg.data.s32, imin, imax) : \
1471                         imm0.reg.data.s32; \
1472          break; \
1473       case TYPE_U32: \
1474          res.data.dst = i->saturate ? \
1475                         CLAMP(imm0.reg.data.u32, umin, umax) : \
1476                         imm0.reg.data.u32; \
1477          break; \
1478       case TYPE_S16: \
1479          res.data.dst = i->saturate ? \
1480                         CLAMP(imm0.reg.data.s16, imin, imax) : \
1481                         imm0.reg.data.s16; \
1482          break; \
1483       case TYPE_U16: \
1484          res.data.dst = i->saturate ? \
1485                         CLAMP(imm0.reg.data.u16, umin, umax) : \
1486                         imm0.reg.data.u16; \
1487          break; \
1488       default: return; \
1489       } \
1490       i->setSrc(0, bld.mkImm(res.data.dst)); \
1491       break
1492 
1493       switch(i->dType) {
1494       CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
1495       CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
1496       CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
1497       CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
1498       case TYPE_F32:
1499          switch (i->sType) {
1500          case TYPE_F64:
1501             res.data.f32 = i->saturate ?
1502                CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
1503                imm0.reg.data.f64;
1504             break;
1505          case TYPE_F32:
1506             res.data.f32 = i->saturate ?
1507                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
1508                imm0.reg.data.f32;
1509             break;
1510          case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
1511          case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
1512          case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
1513          case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
1514          default:
1515             return;
1516          }
1517          i->setSrc(0, bld.mkImm(res.data.f32));
1518          break;
1519       case TYPE_F64:
1520          switch (i->sType) {
1521          case TYPE_F64:
1522             res.data.f64 = i->saturate ?
1523                CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
1524                imm0.reg.data.f64;
1525             break;
1526          case TYPE_F32:
1527             res.data.f64 = i->saturate ?
1528                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
1529                imm0.reg.data.f32;
1530             break;
1531          case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
1532          case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
1533          case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
1534          case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
1535          default:
1536             return;
1537          }
1538          i->setSrc(0, bld.mkImm(res.data.f64));
1539          break;
1540       default:
1541          return;
1542       }
1543 #undef CASE
1544 
1545       i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
1546       i->op = OP_MOV;
1547       i->saturate = 0;
1548       i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
1549       break;
1550    }
1551    default:
1552       return;
1553    }
1554 
1555    // This can get left behind some of the optimizations which simplify
1556    // saturatable values.
1557    if (newi->op == OP_MOV && newi->saturate) {
1558       ImmediateValue tmp;
1559       newi->saturate = 0;
1560       newi->op = OP_SAT;
1561       if (newi->src(0).getImmediate(tmp))
1562          unary(newi, tmp);
1563    }
1564 
1565    if (newi->op != op)
1566       foldCount++;
1567 }
1568 
1569 // =============================================================================
1570 
1571 // Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
1572 class ModifierFolding : public Pass
1573 {
1574 private:
1575    virtual bool visit(BasicBlock *);
1576 };
1577 
1578 bool
visit(BasicBlock * bb)1579 ModifierFolding::visit(BasicBlock *bb)
1580 {
1581    const Target *target = prog->getTarget();
1582 
1583    Instruction *i, *next, *mi;
1584    Modifier mod;
1585 
1586    for (i = bb->getEntry(); i; i = next) {
1587       next = i->next;
1588 
1589       if (0 && i->op == OP_SUB) {
1590          // turn "sub" into "add neg" (do we really want this ?)
1591          i->op = OP_ADD;
1592          i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
1593       }
1594 
1595       for (int s = 0; s < 3 && i->srcExists(s); ++s) {
1596          mi = i->getSrc(s)->getInsn();
1597          if (!mi ||
1598              mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
1599             continue;
1600          if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
1601             if ((i->op != OP_ADD &&
1602                  i->op != OP_MUL) ||
1603                 (mi->op != OP_ABS &&
1604                  mi->op != OP_NEG))
1605                continue;
1606          } else
1607          if (i->sType != mi->dType) {
1608             continue;
1609          }
1610          if ((mod = Modifier(mi->op)) == Modifier(0))
1611             continue;
1612          mod *= mi->src(0).mod;
1613 
1614          if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
1615             // abs neg [abs] = abs
1616             mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
1617          } else
1618          if ((i->op == OP_NEG) && mod.neg()) {
1619             assert(s == 0);
1620             // neg as both opcode and modifier on same insn is prohibited
1621             // neg neg abs = abs, neg neg = identity
1622             mod = mod & Modifier(~NV50_IR_MOD_NEG);
1623             i->op = mod.getOp();
1624             mod = mod & Modifier(~NV50_IR_MOD_ABS);
1625             if (mod == Modifier(0))
1626                i->op = OP_MOV;
1627          }
1628 
1629          if (target->isModSupported(i, s, mod)) {
1630             i->setSrc(s, mi->getSrc(0));
1631             i->src(s).mod *= mod;
1632          }
1633       }
1634 
1635       if (i->op == OP_SAT) {
1636          mi = i->getSrc(0)->getInsn();
1637          if (mi &&
1638              mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
1639             mi->saturate = 1;
1640             mi->setDef(0, i->getDef(0));
1641             delete_Instruction(prog, i);
1642          }
1643       }
1644    }
1645 
1646    return true;
1647 }
1648 
1649 // =============================================================================
1650 
1651 // MUL + ADD -> MAD/FMA
1652 // MIN/MAX(a, a) -> a, etc.
1653 // SLCT(a, b, const) -> cc(const) ? a : b
1654 // RCP(RCP(a)) -> a
1655 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
1656 class AlgebraicOpt : public Pass
1657 {
1658 private:
1659    virtual bool visit(BasicBlock *);
1660 
1661    void handleABS(Instruction *);
1662    bool handleADD(Instruction *);
1663    bool tryADDToMADOrSAD(Instruction *, operation toOp);
1664    void handleMINMAX(Instruction *);
1665    void handleRCP(Instruction *);
1666    void handleSLCT(Instruction *);
1667    void handleLOGOP(Instruction *);
1668    void handleCVT_NEG(Instruction *);
1669    void handleCVT_CVT(Instruction *);
1670    void handleCVT_EXTBF(Instruction *);
1671    void handleSUCLAMP(Instruction *);
1672    void handleNEG(Instruction *);
1673 
1674    BuildUtil bld;
1675 };
1676 
1677 void
handleABS(Instruction * abs)1678 AlgebraicOpt::handleABS(Instruction *abs)
1679 {
1680    Instruction *sub = abs->getSrc(0)->getInsn();
1681    DataType ty;
1682    if (!sub ||
1683        !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
1684       return;
1685    // expect not to have mods yet, if we do, bail
1686    if (sub->src(0).mod || sub->src(1).mod)
1687       return;
1688    // hidden conversion ?
1689    ty = intTypeToSigned(sub->dType);
1690    if (abs->dType != abs->sType || ty != abs->sType)
1691       return;
1692 
1693    if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
1694        sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
1695        sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
1696          return;
1697 
1698    Value *src0 = sub->getSrc(0);
1699    Value *src1 = sub->getSrc(1);
1700 
1701    if (sub->op == OP_ADD) {
1702       Instruction *neg = sub->getSrc(1)->getInsn();
1703       if (neg && neg->op != OP_NEG) {
1704          neg = sub->getSrc(0)->getInsn();
1705          src0 = sub->getSrc(1);
1706       }
1707       if (!neg || neg->op != OP_NEG ||
1708           neg->dType != neg->sType || neg->sType != ty)
1709          return;
1710       src1 = neg->getSrc(0);
1711    }
1712 
1713    // found ABS(SUB))
1714    abs->moveSources(1, 2); // move sources >=1 up by 2
1715    abs->op = OP_SAD;
1716    abs->setType(sub->dType);
1717    abs->setSrc(0, src0);
1718    abs->setSrc(1, src1);
1719    bld.setPosition(abs, false);
1720    abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
1721 }
1722 
1723 bool
handleADD(Instruction * add)1724 AlgebraicOpt::handleADD(Instruction *add)
1725 {
1726    Value *src0 = add->getSrc(0);
1727    Value *src1 = add->getSrc(1);
1728 
1729    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
1730       return false;
1731 
1732    bool changed = false;
1733    // we can't optimize to MAD if the add is precise
1734    if (!add->precise && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
1735       changed = tryADDToMADOrSAD(add, OP_MAD);
1736    if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
1737       changed = tryADDToMADOrSAD(add, OP_SAD);
1738    return changed;
1739 }
1740 
1741 // ADD(SAD(a,b,0), c) -> SAD(a,b,c)
1742 // ADD(MUL(a,b), c) -> MAD(a,b,c)
1743 bool
tryADDToMADOrSAD(Instruction * add,operation toOp)1744 AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
1745 {
1746    Value *src0 = add->getSrc(0);
1747    Value *src1 = add->getSrc(1);
1748    Value *src;
1749    int s;
1750    const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
1751    const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
1752    Modifier mod[4];
1753 
1754    if (src0->refCount() == 1 &&
1755        src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
1756       s = 0;
1757    else
1758    if (src1->refCount() == 1 &&
1759        src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
1760       s = 1;
1761    else
1762       return false;
1763 
1764    src = add->getSrc(s);
1765 
1766    if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
1767       return false;
1768 
1769    if (src->getInsn()->saturate || src->getInsn()->postFactor ||
1770        src->getInsn()->dnz || src->getInsn()->precise)
1771       return false;
1772 
1773    if (toOp == OP_SAD) {
1774       ImmediateValue imm;
1775       if (!src->getInsn()->src(2).getImmediate(imm))
1776          return false;
1777       if (!imm.isInteger(0))
1778          return false;
1779    }
1780 
1781    if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
1782        isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
1783       return false;
1784 
1785    mod[0] = add->src(0).mod;
1786    mod[1] = add->src(1).mod;
1787    mod[2] = src->getUniqueInsn()->src(0).mod;
1788    mod[3] = src->getUniqueInsn()->src(1).mod;
1789 
1790    if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
1791       return false;
1792 
1793    add->op = toOp;
1794    add->subOp = src->getInsn()->subOp; // potentially mul-high
1795    add->dnz = src->getInsn()->dnz;
1796    add->dType = src->getInsn()->dType; // sign matters for imad hi
1797    add->sType = src->getInsn()->sType;
1798 
1799    add->setSrc(2, add->src(s ? 0 : 1));
1800 
1801    add->setSrc(0, src->getInsn()->getSrc(0));
1802    add->src(0).mod = mod[2] ^ mod[s];
1803    add->setSrc(1, src->getInsn()->getSrc(1));
1804    add->src(1).mod = mod[3];
1805 
1806    return true;
1807 }
1808 
1809 void
handleMINMAX(Instruction * minmax)1810 AlgebraicOpt::handleMINMAX(Instruction *minmax)
1811 {
1812    Value *src0 = minmax->getSrc(0);
1813    Value *src1 = minmax->getSrc(1);
1814 
1815    if (src0 != src1 || src0->reg.file != FILE_GPR)
1816       return;
1817    if (minmax->src(0).mod == minmax->src(1).mod) {
1818       if (minmax->def(0).mayReplace(minmax->src(0))) {
1819          minmax->def(0).replace(minmax->src(0), false);
1820          minmax->bb->remove(minmax);
1821       } else {
1822          minmax->op = OP_CVT;
1823          minmax->setSrc(1, NULL);
1824       }
1825    } else {
1826       // TODO:
1827       // min(x, -x) = -abs(x)
1828       // min(x, -abs(x)) = -abs(x)
1829       // min(x, abs(x)) = x
1830       // max(x, -abs(x)) = x
1831       // max(x, abs(x)) = abs(x)
1832       // max(x, -x) = abs(x)
1833    }
1834 }
1835 
1836 void
handleRCP(Instruction * rcp)1837 AlgebraicOpt::handleRCP(Instruction *rcp)
1838 {
1839    Instruction *si = rcp->getSrc(0)->getUniqueInsn();
1840 
1841    if (si && si->op == OP_RCP) {
1842       Modifier mod = rcp->src(0).mod * si->src(0).mod;
1843       rcp->op = mod.getOp();
1844       rcp->setSrc(0, si->getSrc(0));
1845    }
1846 }
1847 
1848 void
handleSLCT(Instruction * slct)1849 AlgebraicOpt::handleSLCT(Instruction *slct)
1850 {
1851    if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
1852       if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
1853          slct->setSrc(0, slct->getSrc(1));
1854    } else
1855    if (slct->getSrc(0) != slct->getSrc(1)) {
1856       return;
1857    }
1858    slct->op = OP_MOV;
1859    slct->setSrc(1, NULL);
1860    slct->setSrc(2, NULL);
1861 }
1862 
1863 void
handleLOGOP(Instruction * logop)1864 AlgebraicOpt::handleLOGOP(Instruction *logop)
1865 {
1866    Value *src0 = logop->getSrc(0);
1867    Value *src1 = logop->getSrc(1);
1868 
1869    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
1870       return;
1871 
1872    if (src0 == src1) {
1873       if ((logop->op == OP_AND || logop->op == OP_OR) &&
1874           logop->def(0).mayReplace(logop->src(0))) {
1875          logop->def(0).replace(logop->src(0), false);
1876          delete_Instruction(prog, logop);
1877       }
1878    } else {
1879       // try AND(SET, SET) -> SET_AND(SET)
1880       Instruction *set0 = src0->getInsn();
1881       Instruction *set1 = src1->getInsn();
1882 
1883       if (!set0 || set0->fixed || !set1 || set1->fixed)
1884          return;
1885       if (set1->op != OP_SET) {
1886          Instruction *xchg = set0;
1887          set0 = set1;
1888          set1 = xchg;
1889          if (set1->op != OP_SET)
1890             return;
1891       }
1892       operation redOp = (logop->op == OP_AND ? OP_SET_AND :
1893                          logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
1894       if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
1895          return;
1896       if (set0->op != OP_SET &&
1897           set0->op != OP_SET_AND &&
1898           set0->op != OP_SET_OR &&
1899           set0->op != OP_SET_XOR)
1900          return;
1901       if (set0->getDef(0)->refCount() > 1 &&
1902           set1->getDef(0)->refCount() > 1)
1903          return;
1904       if (set0->getPredicate() || set1->getPredicate())
1905          return;
1906       // check that they don't source each other
1907       for (int s = 0; s < 2; ++s)
1908          if (set0->getSrc(s) == set1->getDef(0) ||
1909              set1->getSrc(s) == set0->getDef(0))
1910             return;
1911 
1912       set0 = cloneForward(func, set0);
1913       set1 = cloneShallow(func, set1);
1914       logop->bb->insertAfter(logop, set1);
1915       logop->bb->insertAfter(logop, set0);
1916 
1917       set0->dType = TYPE_U8;
1918       set0->getDef(0)->reg.file = FILE_PREDICATE;
1919       set0->getDef(0)->reg.size = 1;
1920       set1->setSrc(2, set0->getDef(0));
1921       set1->op = redOp;
1922       set1->setDef(0, logop->getDef(0));
1923       delete_Instruction(prog, logop);
1924    }
1925 }
1926 
1927 // F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
1928 // nv50:
1929 //  F2I(NEG(I2F(ABS(SET))))
1930 void
handleCVT_NEG(Instruction * cvt)1931 AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
1932 {
1933    Instruction *insn = cvt->getSrc(0)->getInsn();
1934    if (cvt->sType != TYPE_F32 ||
1935        cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
1936       return;
1937    if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
1938       return;
1939    if (insn->src(0).mod != Modifier(0))
1940       return;
1941    insn = insn->getSrc(0)->getInsn();
1942 
1943    // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
1944    if (insn && insn->op == OP_CVT &&
1945        insn->dType == TYPE_F32 &&
1946        insn->sType == TYPE_S32) {
1947       insn = insn->getSrc(0)->getInsn();
1948       if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
1949           insn->src(0).mod)
1950          return;
1951       insn = insn->getSrc(0)->getInsn();
1952       if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
1953          return;
1954    } else
1955    if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
1956       return;
1957    }
1958 
1959    Instruction *bset = cloneShallow(func, insn);
1960    bset->dType = TYPE_U32;
1961    bset->setDef(0, cvt->getDef(0));
1962    cvt->bb->insertAfter(cvt, bset);
1963    delete_Instruction(prog, cvt);
1964 }
1965 
1966 // F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
1967 // does a type conversion, this becomes trickier as there might be range
1968 // changes/etc. We could handle those in theory as long as the range was being
1969 // reduced or kept the same.
1970 void
handleCVT_CVT(Instruction * cvt)1971 AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
1972 {
1973    Instruction *insn = cvt->getSrc(0)->getInsn();
1974    RoundMode rnd = insn->rnd;
1975 
1976    if (insn->saturate ||
1977        insn->subOp ||
1978        insn->dType != insn->sType ||
1979        insn->dType != cvt->sType)
1980       return;
1981 
1982    switch (insn->op) {
1983    case OP_CEIL:
1984       rnd = ROUND_PI;
1985       break;
1986    case OP_FLOOR:
1987       rnd = ROUND_MI;
1988       break;
1989    case OP_TRUNC:
1990       rnd = ROUND_ZI;
1991       break;
1992    case OP_CVT:
1993       break;
1994    default:
1995       return;
1996    }
1997 
1998    if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
1999       rnd = (RoundMode)(rnd & 3);
2000 
2001    cvt->rnd = rnd;
2002    cvt->setSrc(0, insn->getSrc(0));
2003    cvt->src(0).mod *= insn->src(0).mod;
2004    cvt->sType = insn->sType;
2005 }
2006 
2007 // Some shaders extract packed bytes out of words and convert them to
2008 // e.g. float. The Fermi+ CVT instruction can extract those directly, as can
2009 // nv50 for word sizes.
2010 //
2011 // CVT(EXTBF(x, byte/word))
2012 // CVT(AND(bytemask, x))
2013 // CVT(AND(bytemask, SHR(x, 8/16/24)))
2014 // CVT(SHR(x, 16/24))
2015 void
handleCVT_EXTBF(Instruction * cvt)2016 AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
2017 {
2018    Instruction *insn = cvt->getSrc(0)->getInsn();
2019    ImmediateValue imm;
2020    Value *arg = NULL;
2021    unsigned width, offset;
2022    if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
2023       return;
2024    if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
2025       width = (imm.reg.data.u32 >> 8) & 0xff;
2026       offset = imm.reg.data.u32 & 0xff;
2027       arg = insn->getSrc(0);
2028 
2029       if (width != 8 && width != 16)
2030          return;
2031       if (width == 8 && offset & 0x7)
2032          return;
2033       if (width == 16 && offset & 0xf)
2034          return;
2035    } else if (insn->op == OP_AND) {
2036       int s;
2037       if (insn->src(0).getImmediate(imm))
2038          s = 0;
2039       else if (insn->src(1).getImmediate(imm))
2040          s = 1;
2041       else
2042          return;
2043 
2044       if (imm.reg.data.u32 == 0xff)
2045          width = 8;
2046       else if (imm.reg.data.u32 == 0xffff)
2047          width = 16;
2048       else
2049          return;
2050 
2051       arg = insn->getSrc(!s);
2052       Instruction *shift = arg->getInsn();
2053       offset = 0;
2054       if (shift && shift->op == OP_SHR &&
2055           shift->sType == cvt->sType &&
2056           shift->src(1).getImmediate(imm) &&
2057           ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
2058            (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
2059          arg = shift->getSrc(0);
2060          offset = imm.reg.data.u32;
2061       }
2062       // We just AND'd the high bits away, which means this is effectively an
2063       // unsigned value.
2064       cvt->sType = TYPE_U32;
2065    } else if (insn->op == OP_SHR &&
2066               insn->sType == cvt->sType &&
2067               insn->src(1).getImmediate(imm)) {
2068       arg = insn->getSrc(0);
2069       if (imm.reg.data.u32 == 24) {
2070          width = 8;
2071          offset = 24;
2072       } else if (imm.reg.data.u32 == 16) {
2073          width = 16;
2074          offset = 16;
2075       } else {
2076          return;
2077       }
2078    }
2079 
2080    if (!arg)
2081       return;
2082 
2083    // Irrespective of what came earlier, we can undo a shift on the argument
2084    // by adjusting the offset.
2085    Instruction *shift = arg->getInsn();
2086    if (shift && shift->op == OP_SHL &&
2087        shift->src(1).getImmediate(imm) &&
2088        ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
2089         (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
2090        imm.reg.data.u32 <= offset) {
2091       arg = shift->getSrc(0);
2092       offset -= imm.reg.data.u32;
2093    }
2094 
2095    // The unpackSnorm lowering still leaves a few shifts behind, but it's too
2096    // annoying to detect them.
2097 
2098    if (width == 8) {
2099       cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
2100    } else {
2101       assert(width == 16);
2102       cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
2103    }
2104    cvt->setSrc(0, arg);
2105    cvt->subOp = offset >> 3;
2106 }
2107 
2108 // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
2109 void
handleSUCLAMP(Instruction * insn)2110 AlgebraicOpt::handleSUCLAMP(Instruction *insn)
2111 {
2112    ImmediateValue imm;
2113    int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
2114    int s;
2115    Instruction *add;
2116 
2117    assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);
2118 
2119    // look for ADD (TODO: only count references by non-SUCLAMP)
2120    if (insn->getSrc(0)->refCount() > 1)
2121       return;
2122    add = insn->getSrc(0)->getInsn();
2123    if (!add || add->op != OP_ADD ||
2124        (add->dType != TYPE_U32 &&
2125         add->dType != TYPE_S32))
2126       return;
2127 
2128    // look for immediate
2129    for (s = 0; s < 2; ++s)
2130       if (add->src(s).getImmediate(imm))
2131          break;
2132    if (s >= 2)
2133       return;
2134    s = s ? 0 : 1;
2135    // determine if immediate fits
2136    val += imm.reg.data.s32;
2137    if (val > 31 || val < -32)
2138       return;
2139    // determine if other addend fits
2140    if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
2141       return;
2142 
2143    bld.setPosition(insn, false); // make sure bld is init'ed
2144    // replace sources
2145    insn->setSrc(2, bld.mkImm(val));
2146    insn->setSrc(0, add->getSrc(s));
2147 }
2148 
2149 // NEG(AND(SET, 1)) -> SET
2150 void
handleNEG(Instruction * i)2151 AlgebraicOpt::handleNEG(Instruction *i) {
2152    Instruction *src = i->getSrc(0)->getInsn();
2153    ImmediateValue imm;
2154    int b;
2155 
2156    if (isFloatType(i->sType) || !src || src->op != OP_AND)
2157       return;
2158 
2159    if (src->src(0).getImmediate(imm))
2160       b = 1;
2161    else if (src->src(1).getImmediate(imm))
2162       b = 0;
2163    else
2164       return;
2165 
2166    if (!imm.isInteger(1))
2167       return;
2168 
2169    Instruction *set = src->getSrc(b)->getInsn();
2170    if ((set->op == OP_SET || set->op == OP_SET_AND ||
2171        set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
2172        !isFloatType(set->dType)) {
2173       i->def(0).replace(set->getDef(0), false);
2174    }
2175 }
2176 
2177 bool
visit(BasicBlock * bb)2178 AlgebraicOpt::visit(BasicBlock *bb)
2179 {
2180    Instruction *next;
2181    for (Instruction *i = bb->getEntry(); i; i = next) {
2182       next = i->next;
2183       switch (i->op) {
2184       case OP_ABS:
2185          handleABS(i);
2186          break;
2187       case OP_ADD:
2188          handleADD(i);
2189          break;
2190       case OP_RCP:
2191          handleRCP(i);
2192          break;
2193       case OP_MIN:
2194       case OP_MAX:
2195          handleMINMAX(i);
2196          break;
2197       case OP_SLCT:
2198          handleSLCT(i);
2199          break;
2200       case OP_AND:
2201       case OP_OR:
2202       case OP_XOR:
2203          handleLOGOP(i);
2204          break;
2205       case OP_CVT:
2206          handleCVT_NEG(i);
2207          handleCVT_CVT(i);
2208          if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
2209              handleCVT_EXTBF(i);
2210          break;
2211       case OP_SUCLAMP:
2212          handleSUCLAMP(i);
2213          break;
2214       case OP_NEG:
2215          handleNEG(i);
2216          break;
2217       default:
2218          break;
2219       }
2220    }
2221 
2222    return true;
2223 }
2224 
2225 // =============================================================================
2226 
2227 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
2228 class LateAlgebraicOpt : public Pass
2229 {
2230 private:
2231    virtual bool visit(Instruction *);
2232 
2233    void handleADD(Instruction *);
2234    bool tryADDToSHLADD(Instruction *);
2235 };
2236 
2237 void
handleADD(Instruction * add)2238 LateAlgebraicOpt::handleADD(Instruction *add)
2239 {
2240    Value *src0 = add->getSrc(0);
2241    Value *src1 = add->getSrc(1);
2242 
2243    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
2244       return;
2245 
2246    if (prog->getTarget()->isOpSupported(OP_SHLADD, add->dType))
2247       tryADDToSHLADD(add);
2248 }
2249 
2250 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
2251 bool
tryADDToSHLADD(Instruction * add)2252 LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
2253 {
2254    Value *src0 = add->getSrc(0);
2255    Value *src1 = add->getSrc(1);
2256    ImmediateValue imm;
2257    Instruction *shl;
2258    Value *src;
2259    int s;
2260 
2261    if (add->saturate || add->usesFlags() || typeSizeof(add->dType) == 8
2262        || isFloatType(add->dType))
2263       return false;
2264 
2265    if (src0->getUniqueInsn() && src0->getUniqueInsn()->op == OP_SHL)
2266       s = 0;
2267    else
2268    if (src1->getUniqueInsn() && src1->getUniqueInsn()->op == OP_SHL)
2269       s = 1;
2270    else
2271       return false;
2272 
2273    src = add->getSrc(s);
2274    shl = src->getUniqueInsn();
2275 
2276    if (shl->bb != add->bb || shl->usesFlags() || shl->subOp || shl->src(0).mod)
2277       return false;
2278 
2279    if (!shl->src(1).getImmediate(imm))
2280       return false;
2281 
2282    add->op = OP_SHLADD;
2283    add->setSrc(2, add->src(!s));
2284    // SHL can't have any modifiers, but the ADD source may have had
2285    // one. Preserve it.
2286    add->setSrc(0, shl->getSrc(0));
2287    if (s == 1)
2288       add->src(0).mod = add->src(1).mod;
2289    add->setSrc(1, new_ImmediateValue(shl->bb->getProgram(), imm.reg.data.u32));
2290    add->src(1).mod = Modifier(0);
2291 
2292    return true;
2293 }
2294 
2295 bool
visit(Instruction * i)2296 LateAlgebraicOpt::visit(Instruction *i)
2297 {
2298    switch (i->op) {
2299    case OP_ADD:
2300       handleADD(i);
2301       break;
2302    default:
2303       break;
2304    }
2305 
2306    return true;
2307 }
2308 
2309 // =============================================================================
2310 
2311 // Split 64-bit MUL and MAD
2312 class Split64BitOpPreRA : public Pass
2313 {
2314 private:
2315    virtual bool visit(BasicBlock *);
2316    void split64MulMad(Function *, Instruction *, DataType);
2317 
2318    BuildUtil bld;
2319 };
2320 
2321 bool
visit(BasicBlock * bb)2322 Split64BitOpPreRA::visit(BasicBlock *bb)
2323 {
2324    Instruction *i, *next;
2325    Modifier mod;
2326 
2327    for (i = bb->getEntry(); i; i = next) {
2328       next = i->next;
2329 
2330       DataType hTy;
2331       switch (i->dType) {
2332       case TYPE_U64: hTy = TYPE_U32; break;
2333       case TYPE_S64: hTy = TYPE_S32; break;
2334       default:
2335          continue;
2336       }
2337 
2338       if (i->op == OP_MAD || i->op == OP_MUL)
2339          split64MulMad(func, i, hTy);
2340    }
2341 
2342    return true;
2343 }
2344 
2345 void
split64MulMad(Function * fn,Instruction * i,DataType hTy)2346 Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
2347 {
2348    assert(i->op == OP_MAD || i->op == OP_MUL);
2349    assert(!isFloatType(i->dType) && !isFloatType(i->sType));
2350    assert(typeSizeof(hTy) == 4);
2351 
2352    bld.setPosition(i, true);
2353 
2354    Value *zero = bld.mkImm(0u);
2355    Value *carry = bld.getSSA(1, FILE_FLAGS);
2356 
2357    // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
2358    // values (a, b and c might be 32-bit values), using 32-bit operations. This
2359    // gives the following operations:
2360    // * `d.low = low(a.low * b.low) (+ c.low)?`
2361    // * `d.high = low(a.high * b.low) + low(a.low * b.high)
2362    //           + high(a.low * b.low) (+ c.high)?`
2363    //
2364    // To compute the high bits, we can split in the following operations:
2365    // * `tmp1   = low(a.high * b.low) (+ c.high)?`
2366    // * `tmp2   = low(a.low * b.high) + tmp1`
2367    // * `d.high = high(a.low * b.low) + tmp2`
2368    //
2369    // mkSplit put lower bits at index 0 and higher bits at index 1
2370 
2371    Value *op1[2];
2372    if (i->getSrc(0)->reg.size == 8)
2373       bld.mkSplit(op1, 4, i->getSrc(0));
2374    else {
2375       op1[0] = i->getSrc(0);
2376       op1[1] = zero;
2377    }
2378    Value *op2[2];
2379    if (i->getSrc(1)->reg.size == 8)
2380       bld.mkSplit(op2, 4, i->getSrc(1));
2381    else {
2382       op2[0] = i->getSrc(1);
2383       op2[1] = zero;
2384    }
2385 
2386    Value *op3[2] = { NULL, NULL };
2387    if (i->op == OP_MAD) {
2388       if (i->getSrc(2)->reg.size == 8)
2389          bld.mkSplit(op3, 4, i->getSrc(2));
2390       else {
2391          op3[0] = i->getSrc(2);
2392          op3[1] = zero;
2393       }
2394    }
2395 
2396    Value *tmpRes1Hi = bld.getSSA();
2397    if (i->op == OP_MAD)
2398       bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]);
2399    else
2400       bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]);
2401 
2402    Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], tmpRes1Hi);
2403 
2404    Value *def[2] = { bld.getSSA(), bld.getSSA() };
2405 
2406    // If it was a MAD, add the carry from the low bits
2407    // It is not needed if it was a MUL, since we added high(a.low * b.low) to
2408    // d.high
2409    if (i->op == OP_MAD)
2410       bld.mkOp3(OP_MAD, hTy, def[0], op1[0], op2[0], op3[0])->setFlagsDef(1, carry);
2411    else
2412       bld.mkOp2(OP_MUL, hTy, def[0], op1[0], op2[0]);
2413 
2414    Instruction *hiPart3 = bld.mkOp3(OP_MAD, hTy, def[1], op1[0], op2[0], tmpRes2Hi);
2415    hiPart3->subOp = NV50_IR_SUBOP_MUL_HIGH;
2416    if (i->op == OP_MAD)
2417       hiPart3->setFlagsSrc(3, carry);
2418 
2419    bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]);
2420 
2421    delete_Instruction(fn->getProgram(), i);
2422 }
2423 
2424 // =============================================================================
2425 
2426 static inline void
updateLdStOffset(Instruction * ldst,int32_t offset,Function * fn)2427 updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
2428 {
2429    if (offset != ldst->getSrc(0)->reg.data.offset) {
2430       if (ldst->getSrc(0)->refCount() > 1)
2431          ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
2432       ldst->getSrc(0)->reg.data.offset = offset;
2433    }
2434 }
2435 
2436 // Combine loads and stores, forward stores to loads where possible.
2437 class MemoryOpt : public Pass
2438 {
2439 private:
2440    class Record
2441    {
2442    public:
2443       Record *next;
2444       Instruction *insn;
2445       const Value *rel[2];
2446       const Value *base;
2447       int32_t offset;
2448       int8_t fileIndex;
2449       uint8_t size;
2450       bool locked;
2451       Record *prev;
2452 
2453       bool overlaps(const Instruction *ldst) const;
2454 
2455       inline void link(Record **);
2456       inline void unlink(Record **);
2457       inline void set(const Instruction *ldst);
2458    };
2459 
2460 public:
2461    MemoryOpt();
2462 
2463    Record *loads[DATA_FILE_COUNT];
2464    Record *stores[DATA_FILE_COUNT];
2465 
2466    MemoryPool recordPool;
2467 
2468 private:
2469    virtual bool visit(BasicBlock *);
2470    bool runOpt(BasicBlock *);
2471 
2472    Record **getList(const Instruction *);
2473 
2474    Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;
2475 
2476    // merge @insn into load/store instruction from @rec
2477    bool combineLd(Record *rec, Instruction *ld);
2478    bool combineSt(Record *rec, Instruction *st);
2479 
2480    bool replaceLdFromLd(Instruction *ld, Record *ldRec);
2481    bool replaceLdFromSt(Instruction *ld, Record *stRec);
2482    bool replaceStFromSt(Instruction *restrict st, Record *stRec);
2483 
2484    void addRecord(Instruction *ldst);
2485    void purgeRecords(Instruction *const st, DataFile);
2486    void lockStores(Instruction *const ld);
2487    void reset();
2488 
2489 private:
2490    Record *prevRecord;
2491 };
2492 
MemoryOpt()2493 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
2494 {
2495    for (int i = 0; i < DATA_FILE_COUNT; ++i) {
2496       loads[i] = NULL;
2497       stores[i] = NULL;
2498    }
2499    prevRecord = NULL;
2500 }
2501 
2502 void
reset()2503 MemoryOpt::reset()
2504 {
2505    for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
2506       Record *it, *next;
2507       for (it = loads[i]; it; it = next) {
2508          next = it->next;
2509          recordPool.release(it);
2510       }
2511       loads[i] = NULL;
2512       for (it = stores[i]; it; it = next) {
2513          next = it->next;
2514          recordPool.release(it);
2515       }
2516       stores[i] = NULL;
2517    }
2518 }
2519 
2520 bool
combineLd(Record * rec,Instruction * ld)2521 MemoryOpt::combineLd(Record *rec, Instruction *ld)
2522 {
2523    int32_t offRc = rec->offset;
2524    int32_t offLd = ld->getSrc(0)->reg.data.offset;
2525    int sizeRc = rec->size;
2526    int sizeLd = typeSizeof(ld->dType);
2527    int size = sizeRc + sizeLd;
2528    int d, j;
2529 
2530    if (!prog->getTarget()->
2531        isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
2532       return false;
2533    // no unaligned loads
2534    if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
2535        ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
2536       return false;
2537    // for compute indirect loads are not guaranteed to be aligned
2538    if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
2539       return false;
2540 
2541    assert(sizeRc + sizeLd <= 16 && offRc != offLd);
2542 
2543    // lock any stores that overlap with the load being merged into the
2544    // existing record.
2545    lockStores(ld);
2546 
2547    for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);
2548 
2549    if (offLd < offRc) {
2550       int sz;
2551       for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
2552       // d: nr of definitions in ld
2553       // j: nr of definitions in rec->insn, move:
2554       for (d = d + j - 1; j > 0; --j, --d)
2555          rec->insn->setDef(d, rec->insn->getDef(j - 1));
2556 
2557       if (rec->insn->getSrc(0)->refCount() > 1)
2558          rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
2559       rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;
2560 
2561       d = 0;
2562    } else {
2563       d = j;
2564    }
2565    // move definitions of @ld to @rec->insn
2566    for (j = 0; sizeLd; ++j, ++d) {
2567       sizeLd -= ld->getDef(j)->reg.size;
2568       rec->insn->setDef(d, ld->getDef(j));
2569    }
2570 
2571    rec->size = size;
2572    rec->insn->getSrc(0)->reg.size = size;
2573    rec->insn->setType(typeOfSize(size));
2574 
2575    delete_Instruction(prog, ld);
2576 
2577    return true;
2578 }
2579 
2580 bool
combineSt(Record * rec,Instruction * st)2581 MemoryOpt::combineSt(Record *rec, Instruction *st)
2582 {
2583    int32_t offRc = rec->offset;
2584    int32_t offSt = st->getSrc(0)->reg.data.offset;
2585    int sizeRc = rec->size;
2586    int sizeSt = typeSizeof(st->dType);
2587    int s = sizeSt / 4;
2588    int size = sizeRc + sizeSt;
2589    int j, k;
2590    Value *src[4]; // no modifiers in ValueRef allowed for st
2591    Value *extra[3];
2592 
2593    if (!prog->getTarget()->
2594        isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
2595       return false;
2596    // no unaligned stores
2597    if (size == 8 && MIN2(offRc, offSt) & 0x7)
2598       return false;
2599    // for compute indirect stores are not guaranteed to be aligned
2600    if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
2601       return false;
2602 
2603    // remove any existing load/store records for the store being merged into
2604    // the existing record.
2605    purgeRecords(st, DATA_FILE_COUNT);
2606 
2607    st->takeExtraSources(0, extra); // save predicate and indirect address
2608 
2609    if (offRc < offSt) {
2610       // save values from @st
2611       for (s = 0; sizeSt; ++s) {
2612          sizeSt -= st->getSrc(s + 1)->reg.size;
2613          src[s] = st->getSrc(s + 1);
2614       }
2615       // set record's values as low sources of @st
2616       for (j = 1; sizeRc; ++j) {
2617          sizeRc -= rec->insn->getSrc(j)->reg.size;
2618          st->setSrc(j, rec->insn->getSrc(j));
2619       }
2620       // set saved values as high sources of @st
2621       for (k = j, j = 0; j < s; ++j)
2622          st->setSrc(k++, src[j]);
2623 
2624       updateLdStOffset(st, offRc, func);
2625    } else {
2626       for (j = 1; sizeSt; ++j)
2627          sizeSt -= st->getSrc(j)->reg.size;
2628       for (s = 1; sizeRc; ++j, ++s) {
2629          sizeRc -= rec->insn->getSrc(s)->reg.size;
2630          st->setSrc(j, rec->insn->getSrc(s));
2631       }
2632       rec->offset = offSt;
2633    }
2634    st->putExtraSources(0, extra); // restore pointer and predicate
2635 
2636    delete_Instruction(prog, rec->insn);
2637    rec->insn = st;
2638    rec->size = size;
2639    rec->insn->getSrc(0)->reg.size = size;
2640    rec->insn->setType(typeOfSize(size));
2641    return true;
2642 }
2643 
2644 void
set(const Instruction * ldst)2645 MemoryOpt::Record::set(const Instruction *ldst)
2646 {
2647    const Symbol *mem = ldst->getSrc(0)->asSym();
2648    fileIndex = mem->reg.fileIndex;
2649    rel[0] = ldst->getIndirect(0, 0);
2650    rel[1] = ldst->getIndirect(0, 1);
2651    offset = mem->reg.data.offset;
2652    base = mem->getBase();
2653    size = typeSizeof(ldst->sType);
2654 }
2655 
2656 void
link(Record ** list)2657 MemoryOpt::Record::link(Record **list)
2658 {
2659    next = *list;
2660    if (next)
2661       next->prev = this;
2662    prev = NULL;
2663    *list = this;
2664 }
2665 
2666 void
unlink(Record ** list)2667 MemoryOpt::Record::unlink(Record **list)
2668 {
2669    if (next)
2670       next->prev = prev;
2671    if (prev)
2672       prev->next = next;
2673    else
2674       *list = next;
2675 }
2676 
2677 MemoryOpt::Record **
getList(const Instruction * insn)2678 MemoryOpt::getList(const Instruction *insn)
2679 {
2680    if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
2681       return &loads[insn->src(0).getFile()];
2682    return &stores[insn->src(0).getFile()];
2683 }
2684 
2685 void
addRecord(Instruction * i)2686 MemoryOpt::addRecord(Instruction *i)
2687 {
2688    Record **list = getList(i);
2689    Record *it = reinterpret_cast<Record *>(recordPool.allocate());
2690 
2691    it->link(list);
2692    it->set(i);
2693    it->insn = i;
2694    it->locked = false;
2695 }
2696 
2697 MemoryOpt::Record *
findRecord(const Instruction * insn,bool load,bool & isAdj) const2698 MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
2699 {
2700    const Symbol *sym = insn->getSrc(0)->asSym();
2701    const int size = typeSizeof(insn->sType);
2702    Record *rec = NULL;
2703    Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];
2704 
2705    for (; it; it = it->next) {
2706       if (it->locked && insn->op != OP_LOAD && insn->op != OP_VFETCH)
2707          continue;
2708       if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
2709           it->rel[0] != insn->getIndirect(0, 0) ||
2710           it->fileIndex != sym->reg.fileIndex ||
2711           it->rel[1] != insn->getIndirect(0, 1))
2712          continue;
2713 
2714       if (it->offset < sym->reg.data.offset) {
2715          if (it->offset + it->size >= sym->reg.data.offset) {
2716             isAdj = (it->offset + it->size == sym->reg.data.offset);
2717             if (!isAdj)
2718                return it;
2719             if (!(it->offset & 0x7))
2720                rec = it;
2721          }
2722       } else {
2723          isAdj = it->offset != sym->reg.data.offset;
2724          if (size <= it->size && !isAdj)
2725             return it;
2726          else
2727          if (!(sym->reg.data.offset & 0x7))
2728             if (it->offset - size <= sym->reg.data.offset)
2729                rec = it;
2730       }
2731    }
2732    return rec;
2733 }
2734 
2735 bool
replaceLdFromSt(Instruction * ld,Record * rec)2736 MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
2737 {
2738    Instruction *st = rec->insn;
2739    int32_t offSt = rec->offset;
2740    int32_t offLd = ld->getSrc(0)->reg.data.offset;
2741    int d, s;
2742 
2743    for (s = 1; offSt != offLd && st->srcExists(s); ++s)
2744       offSt += st->getSrc(s)->reg.size;
2745    if (offSt != offLd)
2746       return false;
2747 
2748    for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
2749       if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
2750          return false;
2751       if (st->getSrc(s)->reg.file != FILE_GPR)
2752          return false;
2753       ld->def(d).replace(st->src(s), false);
2754    }
2755    ld->bb->remove(ld);
2756    return true;
2757 }
2758 
2759 bool
replaceLdFromLd(Instruction * ldE,Record * rec)2760 MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
2761 {
2762    Instruction *ldR = rec->insn;
2763    int32_t offR = rec->offset;
2764    int32_t offE = ldE->getSrc(0)->reg.data.offset;
2765    int dR, dE;
2766 
2767    assert(offR <= offE);
2768    for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
2769       offR += ldR->getDef(dR)->reg.size;
2770    if (offR != offE)
2771       return false;
2772 
2773    for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
2774       if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
2775          return false;
2776       ldE->def(dE).replace(ldR->getDef(dR), false);
2777    }
2778 
2779    delete_Instruction(prog, ldE);
2780    return true;
2781 }
2782 
2783 bool
replaceStFromSt(Instruction * restrict st,Record * rec)2784 MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
2785 {
2786    const Instruction *const ri = rec->insn;
2787    Value *extra[3];
2788 
2789    int32_t offS = st->getSrc(0)->reg.data.offset;
2790    int32_t offR = rec->offset;
2791    int32_t endS = offS + typeSizeof(st->dType);
2792    int32_t endR = offR + typeSizeof(ri->dType);
2793 
2794    rec->size = MAX2(endS, endR) - MIN2(offS, offR);
2795 
2796    st->takeExtraSources(0, extra);
2797 
2798    if (offR < offS) {
2799       Value *vals[10];
2800       int s, n;
2801       int k = 0;
2802       // get non-replaced sources of ri
2803       for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
2804          vals[k++] = ri->getSrc(s);
2805       n = s;
2806       // get replaced sources of st
2807       for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
2808          vals[k++] = st->getSrc(s);
2809       // skip replaced sources of ri
2810       for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
2811       // get non-replaced sources after values covered by st
2812       for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
2813          vals[k++] = ri->getSrc(s);
2814       assert((unsigned int)k <= ARRAY_SIZE(vals));
2815       for (s = 0; s < k; ++s)
2816          st->setSrc(s + 1, vals[s]);
2817       st->setSrc(0, ri->getSrc(0));
2818    } else
2819    if (endR > endS) {
2820       int j, s;
2821       for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
2822       for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
2823       for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
2824          st->setSrc(s++, ri->getSrc(j));
2825    }
2826    st->putExtraSources(0, extra);
2827 
2828    delete_Instruction(prog, rec->insn);
2829 
2830    rec->insn = st;
2831    rec->offset = st->getSrc(0)->reg.data.offset;
2832 
2833    st->setType(typeOfSize(rec->size));
2834 
2835    return true;
2836 }
2837 
2838 bool
overlaps(const Instruction * ldst) const2839 MemoryOpt::Record::overlaps(const Instruction *ldst) const
2840 {
2841    Record that;
2842    that.set(ldst);
2843 
2844    // This assumes that images/buffers can't overlap. They can.
2845    // TODO: Plumb the restrict logic through, and only skip when it's a
2846    // restrict situation, or there can implicitly be no writes.
2847    if (this->fileIndex != that.fileIndex && this->rel[1] == that.rel[1])
2848       return false;
2849 
2850    if (this->rel[0] || that.rel[0])
2851       return this->base == that.base;
2852 
2853    return
2854       (this->offset < that.offset + that.size) &&
2855       (this->offset + this->size > that.offset);
2856 }
2857 
2858 // We must not eliminate stores that affect the result of @ld if
2859 // we find later stores to the same location, and we may no longer
2860 // merge them with later stores.
2861 // The stored value can, however, still be used to determine the value
2862 // returned by future loads.
2863 void
lockStores(Instruction * const ld)2864 MemoryOpt::lockStores(Instruction *const ld)
2865 {
2866    for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
2867       if (!r->locked && r->overlaps(ld))
2868          r->locked = true;
2869 }
2870 
2871 // Prior loads from the location of @st are no longer valid.
2872 // Stores to the location of @st may no longer be used to derive
2873 // the value at it nor be coalesced into later stores.
2874 void
purgeRecords(Instruction * const st,DataFile f)2875 MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
2876 {
2877    if (st)
2878       f = st->src(0).getFile();
2879 
2880    for (Record *r = loads[f]; r; r = r->next)
2881       if (!st || r->overlaps(st))
2882          r->unlink(&loads[f]);
2883 
2884    for (Record *r = stores[f]; r; r = r->next)
2885       if (!st || r->overlaps(st))
2886          r->unlink(&stores[f]);
2887 }
2888 
2889 bool
visit(BasicBlock * bb)2890 MemoryOpt::visit(BasicBlock *bb)
2891 {
2892    bool ret = runOpt(bb);
2893    // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
2894    // where 96 bit memory operations are forbidden.
2895    if (ret)
2896       ret = runOpt(bb);
2897    return ret;
2898 }
2899 
2900 bool
runOpt(BasicBlock * bb)2901 MemoryOpt::runOpt(BasicBlock *bb)
2902 {
2903    Instruction *ldst, *next;
2904    Record *rec;
2905    bool isAdjacent = true;
2906 
2907    for (ldst = bb->getEntry(); ldst; ldst = next) {
2908       bool keep = true;
2909       bool isLoad = true;
2910       next = ldst->next;
2911 
2912       if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
2913          if (ldst->isDead()) {
2914             // might have been produced by earlier optimization
2915             delete_Instruction(prog, ldst);
2916             continue;
2917          }
2918       } else
2919       if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
2920          if (typeSizeof(ldst->dType) == 4 &&
2921              ldst->src(1).getFile() == FILE_GPR &&
2922              ldst->getSrc(1)->getInsn()->op == OP_NOP) {
2923             delete_Instruction(prog, ldst);
2924             continue;
2925          }
2926          isLoad = false;
2927       } else {
2928          // TODO: maybe have all fixed ops act as barrier ?
2929          if (ldst->op == OP_CALL ||
2930              ldst->op == OP_BAR ||
2931              ldst->op == OP_MEMBAR) {
2932             purgeRecords(NULL, FILE_MEMORY_LOCAL);
2933             purgeRecords(NULL, FILE_MEMORY_GLOBAL);
2934             purgeRecords(NULL, FILE_MEMORY_SHARED);
2935             purgeRecords(NULL, FILE_SHADER_OUTPUT);
2936          } else
2937          if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
2938             if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
2939                purgeRecords(NULL, FILE_MEMORY_LOCAL);
2940                purgeRecords(NULL, FILE_MEMORY_GLOBAL);
2941                purgeRecords(NULL, FILE_MEMORY_SHARED);
2942             } else {
2943                purgeRecords(NULL, ldst->src(0).getFile());
2944             }
2945          } else
2946          if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
2947             purgeRecords(NULL, FILE_SHADER_OUTPUT);
2948          }
2949          continue;
2950       }
2951       if (ldst->getPredicate()) // TODO: handle predicated ld/st
2952          continue;
2953       if (ldst->perPatch) // TODO: create separate per-patch lists
2954          continue;
2955 
2956       if (isLoad) {
2957          DataFile file = ldst->src(0).getFile();
2958 
2959          // if ld l[]/g[] look for previous store to eliminate the reload
2960          if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
2961             // TODO: shared memory ?
2962             rec = findRecord(ldst, false, isAdjacent);
2963             if (rec && !isAdjacent)
2964                keep = !replaceLdFromSt(ldst, rec);
2965          }
2966 
2967          // or look for ld from the same location and replace this one
2968          rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
2969          if (rec) {
2970             if (!isAdjacent)
2971                keep = !replaceLdFromLd(ldst, rec);
2972             else
2973                // or combine a previous load with this one
2974                keep = !combineLd(rec, ldst);
2975          }
2976          if (keep)
2977             lockStores(ldst);
2978       } else {
2979          rec = findRecord(ldst, false, isAdjacent);
2980          if (rec) {
2981             if (!isAdjacent)
2982                keep = !replaceStFromSt(ldst, rec);
2983             else
2984                keep = !combineSt(rec, ldst);
2985          }
2986          if (keep)
2987             purgeRecords(ldst, DATA_FILE_COUNT);
2988       }
2989       if (keep)
2990          addRecord(ldst);
2991    }
2992    reset();
2993 
2994    return true;
2995 }
2996 
2997 // =============================================================================
2998 
2999 // Turn control flow into predicated instructions (after register allocation !).
3000 // TODO:
3001 // Could move this to before register allocation on NVC0 and also handle nested
3002 // constructs.
3003 class FlatteningPass : public Pass
3004 {
3005 private:
3006    virtual bool visit(Function *);
3007    virtual bool visit(BasicBlock *);
3008 
3009    bool tryPredicateConditional(BasicBlock *);
3010    void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
3011    void tryPropagateBranch(BasicBlock *);
3012    inline bool isConstantCondition(Value *pred);
3013    inline bool mayPredicate(const Instruction *, const Value *pred) const;
3014    inline void removeFlow(Instruction *);
3015 
3016    uint8_t gpr_unit;
3017 };
3018 
3019 bool
isConstantCondition(Value * pred)3020 FlatteningPass::isConstantCondition(Value *pred)
3021 {
3022    Instruction *insn = pred->getUniqueInsn();
3023    assert(insn);
3024    if (insn->op != OP_SET || insn->srcExists(2))
3025       return false;
3026 
3027    for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
3028       Instruction *ld = insn->getSrc(s)->getUniqueInsn();
3029       DataFile file;
3030       if (ld) {
3031          if (ld->op != OP_MOV && ld->op != OP_LOAD)
3032             return false;
3033          if (ld->src(0).isIndirect(0))
3034             return false;
3035          file = ld->src(0).getFile();
3036       } else {
3037          file = insn->src(s).getFile();
3038          // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
3039          // in register "units", which can vary between targets.
3040          if (file == FILE_GPR) {
3041             Value *v = insn->getSrc(s);
3042             int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
3043             int units = bytes >> gpr_unit;
3044             if (units > prog->maxGPR)
3045                file = FILE_IMMEDIATE;
3046          }
3047       }
3048       if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
3049          return false;
3050    }
3051    return true;
3052 }
3053 
3054 void
removeFlow(Instruction * insn)3055 FlatteningPass::removeFlow(Instruction *insn)
3056 {
3057    FlowInstruction *term = insn ? insn->asFlow() : NULL;
3058    if (!term)
3059       return;
3060    Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();
3061 
3062    if (term->op == OP_BRA) {
3063       // TODO: this might get more difficult when we get arbitrary BRAs
3064       if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
3065          return;
3066    } else
3067    if (term->op != OP_JOIN)
3068       return;
3069 
3070    Value *pred = term->getPredicate();
3071 
3072    delete_Instruction(prog, term);
3073 
3074    if (pred && pred->refCount() == 0) {
3075       Instruction *pSet = pred->getUniqueInsn();
3076       pred->join->reg.data.id = -1; // deallocate
3077       if (pSet->isDead())
3078          delete_Instruction(prog, pSet);
3079    }
3080 }
3081 
3082 void
predicateInstructions(BasicBlock * bb,Value * pred,CondCode cc)3083 FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
3084 {
3085    for (Instruction *i = bb->getEntry(); i; i = i->next) {
3086       if (i->isNop())
3087          continue;
3088       assert(!i->getPredicate());
3089       i->setPredicate(cc, pred);
3090    }
3091    removeFlow(bb->getExit());
3092 }
3093 
3094 bool
mayPredicate(const Instruction * insn,const Value * pred) const3095 FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
3096 {
3097    if (insn->isPseudo())
3098       return true;
3099    // TODO: calls where we don't know which registers are modified
3100 
3101    if (!prog->getTarget()->mayPredicate(insn, pred))
3102       return false;
3103    for (int d = 0; insn->defExists(d); ++d)
3104       if (insn->getDef(d)->equals(pred))
3105          return false;
3106    return true;
3107 }
3108 
3109 // If we jump to BRA/RET/EXIT, replace the jump with it.
3110 // NOTE: We do not update the CFG anymore here !
3111 //
3112 // TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
3113 //  BB:0
3114 //   @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
3115 //  BB1:
3116 //   bra BB:3
3117 //  BB2:
3118 //   ...
3119 //  BB3:
3120 //   ...
3121 void
tryPropagateBranch(BasicBlock * bb)3122 FlatteningPass::tryPropagateBranch(BasicBlock *bb)
3123 {
3124    for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
3125       BasicBlock *bf = i->asFlow()->target.bb;
3126 
3127       if (bf->getInsnCount() != 1)
3128          continue;
3129 
3130       FlowInstruction *bra = i->asFlow();
3131       FlowInstruction *rep = bf->getExit()->asFlow();
3132 
3133       if (!rep || rep->getPredicate())
3134          continue;
3135       if (rep->op != OP_BRA &&
3136           rep->op != OP_JOIN &&
3137           rep->op != OP_EXIT)
3138          continue;
3139 
3140       // TODO: If there are multiple branches to @rep, only the first would
3141       // be replaced, so only remove them after this pass is done ?
3142       // Also, need to check all incident blocks for fall-through exits and
3143       // add the branch there.
3144       bra->op = rep->op;
3145       bra->target.bb = rep->target.bb;
3146       if (bf->cfg.incidentCount() == 1)
3147          bf->remove(rep);
3148    }
3149 }
3150 
3151 bool
visit(Function * fn)3152 FlatteningPass::visit(Function *fn)
3153 {
3154    gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);
3155 
3156    return true;
3157 }
3158 
3159 bool
visit(BasicBlock * bb)3160 FlatteningPass::visit(BasicBlock *bb)
3161 {
3162    if (tryPredicateConditional(bb))
3163       return true;
3164 
3165    // try to attach join to previous instruction
3166    if (prog->getTarget()->hasJoin) {
3167       Instruction *insn = bb->getExit();
3168       if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
3169          insn = insn->prev;
3170          if (insn && !insn->getPredicate() &&
3171              !insn->asFlow() &&
3172              insn->op != OP_DISCARD &&
3173              insn->op != OP_TEXBAR &&
3174              !isTextureOp(insn->op) && // probably just nve4
3175              !isSurfaceOp(insn->op) && // not confirmed
3176              insn->op != OP_LINTERP && // probably just nve4
3177              insn->op != OP_PINTERP && // probably just nve4
3178              ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
3179               (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
3180              !insn->isNop()) {
3181             insn->join = 1;
3182             bb->remove(bb->getExit());
3183             return true;
3184          }
3185       }
3186    }
3187 
3188    tryPropagateBranch(bb);
3189 
3190    return true;
3191 }
3192 
3193 bool
tryPredicateConditional(BasicBlock * bb)3194 FlatteningPass::tryPredicateConditional(BasicBlock *bb)
3195 {
3196    BasicBlock *bL = NULL, *bR = NULL;
3197    unsigned int nL = 0, nR = 0, limit = 12;
3198    Instruction *insn;
3199    unsigned int mask;
3200 
3201    mask = bb->initiatesSimpleConditional();
3202    if (!mask)
3203       return false;
3204 
3205    assert(bb->getExit());
3206    Value *pred = bb->getExit()->getPredicate();
3207    assert(pred);
3208 
3209    if (isConstantCondition(pred))
3210       limit = 4;
3211 
3212    Graph::EdgeIterator ei = bb->cfg.outgoing();
3213 
3214    if (mask & 1) {
3215       bL = BasicBlock::get(ei.getNode());
3216       for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
3217          if (!mayPredicate(insn, pred))
3218             return false;
3219       if (nL > limit)
3220          return false; // too long, do a real branch
3221    }
3222    ei.next();
3223 
3224    if (mask & 2) {
3225       bR = BasicBlock::get(ei.getNode());
3226       for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
3227          if (!mayPredicate(insn, pred))
3228             return false;
3229       if (nR > limit)
3230          return false; // too long, do a real branch
3231    }
3232 
3233    if (bL)
3234       predicateInstructions(bL, pred, bb->getExit()->cc);
3235    if (bR)
3236       predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));
3237 
3238    if (bb->joinAt) {
3239       bb->remove(bb->joinAt);
3240       bb->joinAt = NULL;
3241    }
3242    removeFlow(bb->getExit()); // delete the branch/join at the fork point
3243 
3244    // remove potential join operations at the end of the conditional
3245    if (prog->getTarget()->joinAnterior) {
3246       bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
3247       if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
3248          removeFlow(bb->getEntry());
3249    }
3250 
3251    return true;
3252 }
3253 
3254 // =============================================================================
3255 
3256 // Fold Immediate into MAD; must be done after register allocation due to
3257 // constraint SDST == SSRC2
3258 // TODO:
3259 // Does NVC0+ have other situations where this pass makes sense?
3260 class PostRaLoadPropagation : public Pass
3261 {
3262 private:
3263    virtual bool visit(Instruction *);
3264 
3265    void handleMADforNV50(Instruction *);
3266    void handleMADforNVC0(Instruction *);
3267 };
3268 
3269 static bool
post_ra_dead(Instruction * i)3270 post_ra_dead(Instruction *i)
3271 {
3272    for (int d = 0; i->defExists(d); ++d)
3273       if (i->getDef(d)->refCount())
3274          return false;
3275    return true;
3276 }
3277 
3278 // Fold Immediate into MAD; must be done after register allocation due to
3279 // constraint SDST == SSRC2
3280 void
handleMADforNV50(Instruction * i)3281 PostRaLoadPropagation::handleMADforNV50(Instruction *i)
3282 {
3283    if (i->def(0).getFile() != FILE_GPR ||
3284        i->src(0).getFile() != FILE_GPR ||
3285        i->src(1).getFile() != FILE_GPR ||
3286        i->src(2).getFile() != FILE_GPR ||
3287        i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
3288       return;
3289 
3290    if (i->getDef(0)->reg.data.id >= 64 ||
3291        i->getSrc(0)->reg.data.id >= 64)
3292       return;
3293 
3294    if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
3295       return;
3296 
3297    if (i->getPredicate())
3298       return;
3299 
3300    Value *vtmp;
3301    Instruction *def = i->getSrc(1)->getInsn();
3302 
3303    if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
3304       def = def->getSrc(0)->getInsn();
3305    if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
3306       vtmp = i->getSrc(1);
3307       if (isFloatType(i->sType)) {
3308          i->setSrc(1, def->getSrc(0));
3309       } else {
3310          ImmediateValue val;
3311          // getImmediate() has side-effects on the argument so this *shouldn't*
3312          // be folded into the assert()
3313          MAYBE_UNUSED bool ret = def->src(0).getImmediate(val);
3314          assert(ret);
3315          if (i->getSrc(1)->reg.data.id & 1)
3316             val.reg.data.u32 >>= 16;
3317          val.reg.data.u32 &= 0xffff;
3318          i->setSrc(1, new_ImmediateValue(prog, val.reg.data.u32));
3319       }
3320 
3321       /* There's no post-RA dead code elimination, so do it here
3322        * XXX: if we add more code-removing post-RA passes, we might
3323        *      want to create a post-RA dead-code elim pass */
3324       if (post_ra_dead(vtmp->getInsn())) {
3325          Value *src = vtmp->getInsn()->getSrc(0);
3326          // Careful -- splits will have already been removed from the
3327          // functions. Don't double-delete.
3328          if (vtmp->getInsn()->bb)
3329             delete_Instruction(prog, vtmp->getInsn());
3330          if (src->getInsn() && post_ra_dead(src->getInsn()))
3331             delete_Instruction(prog, src->getInsn());
3332       }
3333    }
3334 }
3335 
3336 void
handleMADforNVC0(Instruction * i)3337 PostRaLoadPropagation::handleMADforNVC0(Instruction *i)
3338 {
3339    if (i->def(0).getFile() != FILE_GPR ||
3340        i->src(0).getFile() != FILE_GPR ||
3341        i->src(1).getFile() != FILE_GPR ||
3342        i->src(2).getFile() != FILE_GPR ||
3343        i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
3344       return;
3345 
3346    // TODO: gm107 can also do this for S32, maybe other chipsets as well
3347    if (i->dType != TYPE_F32)
3348       return;
3349 
3350    if ((i->src(2).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
3351       return;
3352 
3353    ImmediateValue val;
3354    int s;
3355 
3356    if (i->src(0).getImmediate(val))
3357       s = 1;
3358    else if (i->src(1).getImmediate(val))
3359       s = 0;
3360    else
3361       return;
3362 
3363    if ((i->src(s).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
3364       return;
3365 
3366    if (s == 1)
3367       i->swapSources(0, 1);
3368 
3369    Instruction *imm = i->getSrc(1)->getInsn();
3370    i->setSrc(1, imm->getSrc(0));
3371    if (post_ra_dead(imm))
3372       delete_Instruction(prog, imm);
3373 }
3374 
3375 bool
visit(Instruction * i)3376 PostRaLoadPropagation::visit(Instruction *i)
3377 {
3378    switch (i->op) {
3379    case OP_FMA:
3380    case OP_MAD:
3381       if (prog->getTarget()->getChipset() < 0xc0)
3382          handleMADforNV50(i);
3383       else
3384          handleMADforNVC0(i);
3385       break;
3386    default:
3387       break;
3388    }
3389 
3390    return true;
3391 }
3392 
3393 // =============================================================================
3394 
3395 // Common subexpression elimination. Stupid O^2 implementation.
3396 class LocalCSE : public Pass
3397 {
3398 private:
3399    virtual bool visit(BasicBlock *);
3400 
3401    inline bool tryReplace(Instruction **, Instruction *);
3402 
3403    DLList ops[OP_LAST + 1];
3404 };
3405 
3406 class GlobalCSE : public Pass
3407 {
3408 private:
3409    virtual bool visit(BasicBlock *);
3410 };
3411 
3412 bool
isActionEqual(const Instruction * that) const3413 Instruction::isActionEqual(const Instruction *that) const
3414 {
3415    if (this->op != that->op ||
3416        this->dType != that->dType ||
3417        this->sType != that->sType)
3418       return false;
3419    if (this->cc != that->cc)
3420       return false;
3421 
3422    if (this->asTex()) {
3423       if (memcmp(&this->asTex()->tex,
3424                  &that->asTex()->tex,
3425                  sizeof(this->asTex()->tex)))
3426          return false;
3427    } else
3428    if (this->asCmp()) {
3429       if (this->asCmp()->setCond != that->asCmp()->setCond)
3430          return false;
3431    } else
3432    if (this->asFlow()) {
3433       return false;
3434    } else {
3435       if (this->ipa != that->ipa ||
3436           this->lanes != that->lanes ||
3437           this->perPatch != that->perPatch)
3438          return false;
3439       if (this->postFactor != that->postFactor)
3440          return false;
3441    }
3442 
3443    if (this->subOp != that->subOp ||
3444        this->saturate != that->saturate ||
3445        this->rnd != that->rnd ||
3446        this->ftz != that->ftz ||
3447        this->dnz != that->dnz ||
3448        this->cache != that->cache ||
3449        this->mask != that->mask)
3450       return false;
3451 
3452    return true;
3453 }
3454 
3455 bool
isResultEqual(const Instruction * that) const3456 Instruction::isResultEqual(const Instruction *that) const
3457 {
3458    unsigned int d, s;
3459 
3460    // NOTE: location of discard only affects tex with liveOnly and quadops
3461    if (!this->defExists(0) && this->op != OP_DISCARD)
3462       return false;
3463 
3464    if (!isActionEqual(that))
3465       return false;
3466 
3467    if (this->predSrc != that->predSrc)
3468       return false;
3469 
3470    for (d = 0; this->defExists(d); ++d) {
3471       if (!that->defExists(d) ||
3472           !this->getDef(d)->equals(that->getDef(d), false))
3473          return false;
3474    }
3475    if (that->defExists(d))
3476       return false;
3477 
3478    for (s = 0; this->srcExists(s); ++s) {
3479       if (!that->srcExists(s))
3480          return false;
3481       if (this->src(s).mod != that->src(s).mod)
3482          return false;
3483       if (!this->getSrc(s)->equals(that->getSrc(s), true))
3484          return false;
3485    }
3486    if (that->srcExists(s))
3487       return false;
3488 
3489    if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
3490       switch (src(0).getFile()) {
3491       case FILE_MEMORY_CONST:
3492       case FILE_SHADER_INPUT:
3493          return true;
3494       case FILE_SHADER_OUTPUT:
3495          return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
3496       default:
3497          return false;
3498       }
3499    }
3500 
3501    return true;
3502 }
3503 
3504 // pull through common expressions from different in-blocks
3505 bool
visit(BasicBlock * bb)3506 GlobalCSE::visit(BasicBlock *bb)
3507 {
3508    Instruction *phi, *next, *ik;
3509    int s;
3510 
3511    // TODO: maybe do this with OP_UNION, too
3512 
3513    for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
3514       next = phi->next;
3515       if (phi->getSrc(0)->refCount() > 1)
3516          continue;
3517       ik = phi->getSrc(0)->getInsn();
3518       if (!ik)
3519          continue; // probably a function input
3520       if (ik->defCount(0xff) > 1)
3521          continue; // too painful to check if we can really push this forward
3522       for (s = 1; phi->srcExists(s); ++s) {
3523          if (phi->getSrc(s)->refCount() > 1)
3524             break;
3525          if (!phi->getSrc(s)->getInsn() ||
3526              !phi->getSrc(s)->getInsn()->isResultEqual(ik))
3527             break;
3528       }
3529       if (!phi->srcExists(s)) {
3530          Instruction *entry = bb->getEntry();
3531          ik->bb->remove(ik);
3532          if (!entry || entry->op != OP_JOIN)
3533             bb->insertHead(ik);
3534          else
3535             bb->insertAfter(entry, ik);
3536          ik->setDef(0, phi->getDef(0));
3537          delete_Instruction(prog, phi);
3538       }
3539    }
3540 
3541    return true;
3542 }
3543 
3544 bool
tryReplace(Instruction ** ptr,Instruction * i)3545 LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
3546 {
3547    Instruction *old = *ptr;
3548 
3549    // TODO: maybe relax this later (causes trouble with OP_UNION)
3550    if (i->isPredicated())
3551       return false;
3552 
3553    if (!old->isResultEqual(i))
3554       return false;
3555 
3556    for (int d = 0; old->defExists(d); ++d)
3557       old->def(d).replace(i->getDef(d), false);
3558    delete_Instruction(prog, old);
3559    *ptr = NULL;
3560    return true;
3561 }
3562 
3563 bool
visit(BasicBlock * bb)3564 LocalCSE::visit(BasicBlock *bb)
3565 {
3566    unsigned int replaced;
3567 
3568    do {
3569       Instruction *ir, *next;
3570 
3571       replaced = 0;
3572 
3573       // will need to know the order of instructions
3574       int serial = 0;
3575       for (ir = bb->getFirst(); ir; ir = ir->next)
3576          ir->serial = serial++;
3577 
3578       for (ir = bb->getFirst(); ir; ir = next) {
3579          int s;
3580          Value *src = NULL;
3581 
3582          next = ir->next;
3583 
3584          if (ir->fixed) {
3585             ops[ir->op].insert(ir);
3586             continue;
3587          }
3588 
3589          for (s = 0; ir->srcExists(s); ++s)
3590             if (ir->getSrc(s)->asLValue())
3591                if (!src || ir->getSrc(s)->refCount() < src->refCount())
3592                   src = ir->getSrc(s);
3593 
3594          if (src) {
3595             for (Value::UseIterator it = src->uses.begin();
3596                  it != src->uses.end(); ++it) {
3597                Instruction *ik = (*it)->getInsn();
3598                if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
3599                   if (tryReplace(&ir, ik))
3600                      break;
3601             }
3602          } else {
3603             DLLIST_FOR_EACH(&ops[ir->op], iter)
3604             {
3605                Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
3606                if (tryReplace(&ir, ik))
3607                   break;
3608             }
3609          }
3610 
3611          if (ir)
3612             ops[ir->op].insert(ir);
3613          else
3614             ++replaced;
3615       }
3616       for (unsigned int i = 0; i <= OP_LAST; ++i)
3617          ops[i].clear();
3618 
3619    } while (replaced);
3620 
3621    return true;
3622 }
3623 
3624 // =============================================================================
3625 
3626 // Remove computations of unused values.
3627 class DeadCodeElim : public Pass
3628 {
3629 public:
3630    bool buryAll(Program *);
3631 
3632 private:
3633    virtual bool visit(BasicBlock *);
3634 
3635    void checkSplitLoad(Instruction *ld); // for partially dead loads
3636 
3637    unsigned int deadCount;
3638 };
3639 
3640 bool
buryAll(Program * prog)3641 DeadCodeElim::buryAll(Program *prog)
3642 {
3643    do {
3644       deadCount = 0;
3645       if (!this->run(prog, false, false))
3646          return false;
3647    } while (deadCount);
3648 
3649    return true;
3650 }
3651 
3652 bool
visit(BasicBlock * bb)3653 DeadCodeElim::visit(BasicBlock *bb)
3654 {
3655    Instruction *prev;
3656 
3657    for (Instruction *i = bb->getExit(); i; i = prev) {
3658       prev = i->prev;
3659       if (i->isDead()) {
3660          ++deadCount;
3661          delete_Instruction(prog, i);
3662       } else
3663       if (i->defExists(1) &&
3664           i->subOp == 0 &&
3665           (i->op == OP_VFETCH || i->op == OP_LOAD)) {
3666          checkSplitLoad(i);
3667       } else
3668       if (i->defExists(0) && !i->getDef(0)->refCount()) {
3669          if (i->op == OP_ATOM ||
3670              i->op == OP_SUREDP ||
3671              i->op == OP_SUREDB) {
3672             i->setDef(0, NULL);
3673             if (i->op == OP_ATOM && i->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
3674                i->cache = CACHE_CV;
3675                i->op = OP_STORE;
3676                i->subOp = 0;
3677             }
3678          } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
3679             i->setDef(0, i->getDef(1));
3680             i->setDef(1, NULL);
3681          }
3682       }
3683    }
3684    return true;
3685 }
3686 
3687 // Each load can go into up to 4 destinations, any of which might potentially
3688 // be dead (i.e. a hole). These can always be split into 2 loads, independent
3689 // of where the holes are. We find the first contiguous region, put it into
3690 // the first load, and then put the second contiguous region into the second
3691 // load. There can be at most 2 contiguous regions.
3692 //
3693 // Note that there are some restrictions, for example it's not possible to do
3694 // a 64-bit load that's not 64-bit aligned, so such a load has to be split
3695 // up. Also hardware doesn't support 96-bit loads, so those also have to be
3696 // split into a 64-bit and 32-bit load.
3697 void
checkSplitLoad(Instruction * ld1)3698 DeadCodeElim::checkSplitLoad(Instruction *ld1)
3699 {
3700    Instruction *ld2 = NULL; // can get at most 2 loads
3701    Value *def1[4];
3702    Value *def2[4];
3703    int32_t addr1, addr2;
3704    int32_t size1, size2;
3705    int d, n1, n2;
3706    uint32_t mask = 0xffffffff;
3707 
3708    for (d = 0; ld1->defExists(d); ++d)
3709       if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
3710          mask &= ~(1 << d);
3711    if (mask == 0xffffffff)
3712       return;
3713 
3714    addr1 = ld1->getSrc(0)->reg.data.offset;
3715    n1 = n2 = 0;
3716    size1 = size2 = 0;
3717 
3718    // Compute address/width for first load
3719    for (d = 0; ld1->defExists(d); ++d) {
3720       if (mask & (1 << d)) {
3721          if (size1 && (addr1 & 0x7))
3722             break;
3723          def1[n1] = ld1->getDef(d);
3724          size1 += def1[n1++]->reg.size;
3725       } else
3726       if (!n1) {
3727          addr1 += ld1->getDef(d)->reg.size;
3728       } else {
3729          break;
3730       }
3731    }
3732 
3733    // Scale back the size of the first load until it can be loaded. This
3734    // typically happens for TYPE_B96 loads.
3735    while (n1 &&
3736           !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
3737                                                 typeOfSize(size1))) {
3738       size1 -= def1[--n1]->reg.size;
3739       d--;
3740    }
3741 
3742    // Compute address/width for second load
3743    for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
3744       if (mask & (1 << d)) {
3745          assert(!size2 || !(addr2 & 0x7));
3746          def2[n2] = ld1->getDef(d);
3747          size2 += def2[n2++]->reg.size;
3748       } else if (!n2) {
3749          assert(!n2);
3750          addr2 += ld1->getDef(d)->reg.size;
3751       } else {
3752          break;
3753       }
3754    }
3755 
3756    // Make sure that we've processed all the values
3757    for (; ld1->defExists(d); ++d)
3758       assert(!(mask & (1 << d)));
3759 
3760    updateLdStOffset(ld1, addr1, func);
3761    ld1->setType(typeOfSize(size1));
3762    for (d = 0; d < 4; ++d)
3763       ld1->setDef(d, (d < n1) ? def1[d] : NULL);
3764 
3765    if (!n2)
3766       return;
3767 
3768    ld2 = cloneShallow(func, ld1);
3769    updateLdStOffset(ld2, addr2, func);
3770    ld2->setType(typeOfSize(size2));
3771    for (d = 0; d < 4; ++d)
3772       ld2->setDef(d, (d < n2) ? def2[d] : NULL);
3773 
3774    ld1->bb->insertAfter(ld1, ld2);
3775 }
3776 
3777 // =============================================================================
3778 
3779 #define RUN_PASS(l, n, f)                       \
3780    if (level >= (l)) {                          \
3781       if (dbgFlags & NV50_IR_DEBUG_VERBOSE)     \
3782          INFO("PEEPHOLE: %s\n", #n);            \
3783       n pass;                                   \
3784       if (!pass.f(this))                        \
3785          return false;                          \
3786    }
3787 
3788 bool
optimizeSSA(int level)3789 Program::optimizeSSA(int level)
3790 {
3791    RUN_PASS(1, DeadCodeElim, buryAll);
3792    RUN_PASS(1, CopyPropagation, run);
3793    RUN_PASS(1, MergeSplits, run);
3794    RUN_PASS(2, GlobalCSE, run);
3795    RUN_PASS(1, LocalCSE, run);
3796    RUN_PASS(2, AlgebraicOpt, run);
3797    RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
3798    RUN_PASS(1, ConstantFolding, foldAll);
3799    RUN_PASS(1, Split64BitOpPreRA, run);
3800    RUN_PASS(1, LoadPropagation, run);
3801    RUN_PASS(1, IndirectPropagation, run);
3802    RUN_PASS(2, MemoryOpt, run);
3803    RUN_PASS(2, LateAlgebraicOpt, run);
3804    RUN_PASS(2, LocalCSE, run);
3805    RUN_PASS(0, DeadCodeElim, buryAll);
3806 
3807    return true;
3808 }
3809 
3810 bool
optimizePostRA(int level)3811 Program::optimizePostRA(int level)
3812 {
3813    RUN_PASS(2, FlatteningPass, run);
3814    RUN_PASS(2, PostRaLoadPropagation, run);
3815 
3816    return true;
3817 }
3818 
3819 }
3820