1 /*
2  * Copyright 2011 Christoph Bumiller
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19  * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20  * SOFTWARE.
21  */
22 
23 #include "nv50/codegen/nv50_ir.h"
24 #include "nv50/codegen/nv50_ir_build_util.h"
25 
26 #include "nv50_ir_target_nv50.h"
27 
28 namespace nv50_ir {
29 
30 // nv50 doesn't support 32 bit integer multiplication
31 //
32 //       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
33 // -------------------
34 //    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
35 // ah*bh 00 00                 (           carry1) << 16 + ( carry2)
36 //       al*bl
37 //    ah*bl 00
38 //
39 // fffe0001 + fffe0001
40 static bool
expandIntegerMUL(BuildUtil * bld,Instruction * mul)41 expandIntegerMUL(BuildUtil *bld, Instruction *mul)
42 {
43    const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
44 
45    DataType fTy = mul->sType; // full type
46    DataType hTy;
47    switch (fTy) {
48    case TYPE_S32: hTy = TYPE_S16; break;
49    case TYPE_U32: hTy = TYPE_U16; break;
50    case TYPE_U64: hTy = TYPE_U32; break;
51    case TYPE_S64: hTy = TYPE_S32; break;
52    default:
53       return false;
54    }
55    unsigned int fullSize = typeSizeof(fTy);
56    unsigned int halfSize = typeSizeof(hTy);
57 
58    Instruction *i[9];
59 
60    bld->setPosition(mul, true);
61 
62    Value *a[2], *b[2];
63    Value *c[2];
64    Value *t[4];
65    for (int j = 0; j < 4; ++j)
66       t[j] = bld->getSSA(fullSize);
67 
68    // split sources into halves
69    i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
70    i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
71 
72    i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
73    i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
74    i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
75    i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
76 
77    if (highResult) {
78       Value *r[3];
79       Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
80       c[0] = bld->getSSA(1, FILE_FLAGS);
81       c[1] = bld->getSSA(1, FILE_FLAGS);
82       for (int j = 0; j < 3; ++j)
83          r[j] = bld->getSSA(fullSize);
84 
85       i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
86       i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
87       bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
88       i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
89 
90       // set carry defs / sources
91       i[3]->setFlagsDef(1, c[0]);
92       i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
93       i[6]->setPredicate(CC_C, c[0]);
94       i[5]->setFlagsSrc(3, c[1]);
95    } else {
96       bld->mkMov(mul->getDef(0), t[3]);
97    }
98    delete_Instruction(bld->getProgram(), mul);
99 
100    for (int j = 2; j <= (highResult ? 5 : 4); ++j)
101       if (i[j])
102          i[j]->sType = hTy;
103 
104    return true;
105 }
106 
// Operation selectors used to build a quad operation byte.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Pack four QOP_* selectors (given without the QOP_ prefix) into one byte,
// two bits each, first argument in the topmost bits:
//             UL UR LL LR
#define QUADOP(q, r, s, t) \
   ((QOP_##q << 6) | (QOP_##r << 4) | (QOP_##s << 2) | (QOP_##t << 0))
116 
117 class NV50LegalizePostRA : public Pass
118 {
119 private:
120    virtual bool visit(Function *);
121    virtual bool visit(BasicBlock *);
122 
123    void handlePRERET(FlowInstruction *);
124    void replaceZero(Instruction *);
125    void split64BitOp(Instruction *);
126 
127    LValue *r63;
128 };
129 
130 bool
visit(Function * fn)131 NV50LegalizePostRA::visit(Function *fn)
132 {
133    Program *prog = fn->getProgram();
134 
135    r63 = new_LValue(fn, FILE_GPR);
136    r63->reg.data.id = 63;
137 
138    // this is actually per-program, but we can do it all on visiting main()
139    std::list<Instruction *> *outWrites =
140       reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
141 
142    if (outWrites) {
143       for (std::list<Instruction *>::iterator it = outWrites->begin();
144            it != outWrites->end(); ++it)
145          (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
146       // instructions will be deleted on exit
147       outWrites->clear();
148    }
149 
150    return true;
151 }
152 
153 void
replaceZero(Instruction * i)154 NV50LegalizePostRA::replaceZero(Instruction *i)
155 {
156    for (int s = 0; i->srcExists(s); ++s) {
157       ImmediateValue *imm = i->getSrc(s)->asImm();
158       if (imm && imm->reg.data.u64 == 0)
159          i->setSrc(s, r63);
160    }
161 }
162 
163 void
split64BitOp(Instruction * i)164 NV50LegalizePostRA::split64BitOp(Instruction *i)
165 {
166    if (i->dType == TYPE_F64) {
167       if (i->op == OP_MAD)
168          i->op = OP_FMA;
169       if (i->op == OP_ADD || i->op == OP_MUL || i->op == OP_FMA ||
170           i->op == OP_CVT || i->op == OP_MIN || i->op == OP_MAX ||
171           i->op == OP_SET)
172          return;
173       i->dType = i->sType = TYPE_U32;
174 
175       i->bb->insertAfter(i, cloneForward(func, i));
176    }
177 }
178 
179 // Emulate PRERET: jump to the target and call to the origin from there
180 //
181 // WARNING: atm only works if BBs are affected by at most a single PRERET
182 //
183 // BB:0
184 // preret BB:3
185 // (...)
186 // BB:3
187 // (...)
188 //             --->
189 // BB:0
190 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
191 // (...)
192 // BB:3
193 // bra BB:3 + n1 (skip the call)
194 // call BB:0 + n2 (skip bra at beginning of BB:0)
195 // (...)
196 void
handlePRERET(FlowInstruction * pre)197 NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
198 {
199    BasicBlock *bbE = pre->bb;
200    BasicBlock *bbT = pre->target.bb;
201 
202    pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
203    bbE->remove(pre);
204    bbE->insertHead(pre);
205 
206    Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
207    Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
208 
209    bbT->insertHead(call);
210    bbT->insertHead(skip);
211 
212    // NOTE: maybe split blocks to prevent the instructions from moving ?
213 
214    skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
215    call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
216 }
217 
218 bool
visit(BasicBlock * bb)219 NV50LegalizePostRA::visit(BasicBlock *bb)
220 {
221    Instruction *i, *next;
222 
223    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
224    for (i = bb->getFirst(); i; i = next) {
225       next = i->next;
226       if (i->isNop()) {
227          bb->remove(i);
228       } else
229       if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
230          handlePRERET(i->asFlow());
231       } else {
232          if (i->op != OP_MOV && i->op != OP_PFETCH &&
233              (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
234             replaceZero(i);
235          if (typeSizeof(i->dType) == 8)
236             split64BitOp(i);
237       }
238    }
239    if (!bb->getEntry())
240       return true;
241 
242    return true;
243 }
244 
245 class NV50LegalizeSSA : public Pass
246 {
247 public:
248    NV50LegalizeSSA(Program *);
249 
250    virtual bool visit(BasicBlock *bb);
251 
252 private:
253    void propagateWriteToOutput(Instruction *);
254    void handleDIV(Instruction *);
255    void handleMOD(Instruction *);
256    void handleMUL(Instruction *);
257    void handleAddrDef(Instruction *);
258 
259    inline bool isARL(const Instruction *) const;
260 
261    BuildUtil bld;
262 
263    std::list<Instruction *> *outWrites;
264 };
265 
NV50LegalizeSSA(Program * prog)266 NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
267 {
268    bld.setProgram(prog);
269 
270    if (prog->optLevel >= 2 &&
271        (prog->getType() == Program::TYPE_GEOMETRY ||
272         prog->getType() == Program::TYPE_VERTEX))
273       outWrites =
274          reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
275    else
276       outWrites = NULL;
277 }
278 
279 void
propagateWriteToOutput(Instruction * st)280 NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
281 {
282    if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
283       return;
284 
285    // check def instruction can store
286    Instruction *di = st->getSrc(1)->defs.front()->getInsn();
287 
288    // TODO: move exports (if beneficial) in common opt pass
289    if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
290       return;
291    for (int s = 0; di->srcExists(s); ++s)
292       if (di->src(s).getFile() == FILE_IMMEDIATE)
293          return;
294 
295    // We cannot set defs to non-lvalues before register allocation, so
296    // save & remove (to save registers) the exports and replace later.
297    outWrites->push_back(st);
298    st->bb->remove(st);
299 }
300 
301 bool
isARL(const Instruction * i) const302 NV50LegalizeSSA::isARL(const Instruction *i) const
303 {
304    ImmediateValue imm;
305 
306    if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
307       return false;
308    if (!i->src(1).getImmediate(imm))
309       return false;
310    return imm.isInteger(0);
311 }
312 
313 void
handleAddrDef(Instruction * i)314 NV50LegalizeSSA::handleAddrDef(Instruction *i)
315 {
316    Instruction *arl;
317 
318    i->getDef(0)->reg.size = 2; // $aX are only 16 bit
319 
320    // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
321    if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
322       if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
323          return;
324       if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
325          return;
326    }
327 
328    // turn $a sources into $r sources (can't operate on $a)
329    for (int s = 0; i->srcExists(s); ++s) {
330       Value *a = i->getSrc(s);
331       Value *r;
332       if (a->reg.file == FILE_ADDRESS) {
333          if (a->getInsn() && isARL(a->getInsn())) {
334             i->setSrc(s, a->getInsn()->getSrc(0));
335          } else {
336             bld.setPosition(i, false);
337             r = bld.getSSA();
338             bld.mkMov(r, a);
339             i->setSrc(s, r);
340          }
341       }
342    }
343    if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
344       return;
345 
346    // turn result back into $a
347    bld.setPosition(i, true);
348    arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
349    i->setDef(0, arl->getSrc(0));
350 }
351 
352 void
handleMUL(Instruction * mul)353 NV50LegalizeSSA::handleMUL(Instruction *mul)
354 {
355    if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
356       return;
357    Value *def = mul->getDef(0);
358    Value *pred = mul->getPredicate();
359    CondCode cc = mul->cc;
360    if (pred)
361       mul->setPredicate(CC_ALWAYS, NULL);
362 
363    if (mul->op == OP_MAD) {
364       Instruction *add = mul;
365       bld.setPosition(add, false);
366       Value *res = cloneShallow(func, mul->getDef(0));
367       mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
368       add->op = OP_ADD;
369       add->setSrc(0, mul->getDef(0));
370       add->setSrc(1, add->getSrc(2));
371       for (int s = 2; add->srcExists(s); ++s)
372          add->setSrc(s, NULL);
373       mul->subOp = add->subOp;
374       add->subOp = 0;
375    }
376    expandIntegerMUL(&bld, mul);
377    if (pred)
378       def->getInsn()->setPredicate(cc, pred);
379 }
380 
381 // Use f32 division: first compute an approximate result, use it to reduce
382 // the dividend, which should then be representable as f32, divide the reduced
383 // dividend, and add the quotients.
384 void
handleDIV(Instruction * div)385 NV50LegalizeSSA::handleDIV(Instruction *div)
386 {
387    const DataType ty = div->sType;
388 
389    if (ty != TYPE_U32 && ty != TYPE_S32)
390       return;
391 
392    Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
393 
394    bld.setPosition(div, false);
395 
396    Value *a, *af = bld.getSSA();
397    Value *b, *bf = bld.getSSA();
398 
399    bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
400    bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
401 
402    if (isSignedType(ty)) {
403       af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
404       bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
405       a = bld.getSSA();
406       b = bld.getSSA();
407       bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
408       bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
409    } else {
410       a = div->getSrc(0);
411       b = div->getSrc(1);
412    }
413 
414    bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
415    bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
416 
417    bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
418    bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
419 
420    // get error of 1st result
421    expandIntegerMUL(&bld,
422       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
423    bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
424 
425    bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
426 
427    bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
428    bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
429       ->rnd = ROUND_Z;
430    bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
431 
432    // correction: if modulus >= divisor, add 1
433    expandIntegerMUL(&bld,
434       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
435    bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
436    bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
437    if (!isSignedType(ty)) {
438       div->op = OP_SUB;
439       div->setSrc(0, q);
440       div->setSrc(1, s);
441    } else {
442       t = q;
443       bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
444       s = bld.getSSA();
445       t = bld.getSSA();
446       // fix the sign
447       bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
448          ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
449       bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
450       bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
451 
452       div->op = OP_UNION;
453       div->setSrc(0, s);
454       div->setSrc(1, t);
455    }
456 }
457 
458 void
handleMOD(Instruction * mod)459 NV50LegalizeSSA::handleMOD(Instruction *mod)
460 {
461    if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
462       return;
463    bld.setPosition(mod, false);
464 
465    Value *q = bld.getSSA();
466    Value *m = bld.getSSA();
467 
468    bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
469    handleDIV(q->getInsn());
470 
471    bld.setPosition(mod, false);
472    expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
473 
474    mod->op = OP_SUB;
475    mod->setSrc(1, m);
476 }
477 
478 bool
visit(BasicBlock * bb)479 NV50LegalizeSSA::visit(BasicBlock *bb)
480 {
481    Instruction *insn, *next;
482    // skipping PHIs (don't pass them to handleAddrDef) !
483    for (insn = bb->getEntry(); insn; insn = next) {
484       next = insn->next;
485 
486       switch (insn->op) {
487       case OP_EXPORT:
488          if (outWrites)
489             propagateWriteToOutput(insn);
490          break;
491       case OP_DIV:
492          handleDIV(insn);
493          break;
494       case OP_MOD:
495          handleMOD(insn);
496          break;
497       case OP_MAD:
498       case OP_MUL:
499          handleMUL(insn);
500          break;
501       default:
502          break;
503       }
504 
505       if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
506          handleAddrDef(insn);
507    }
508    return true;
509 }
510 
511 class NV50LoweringPreSSA : public Pass
512 {
513 public:
514    NV50LoweringPreSSA(Program *);
515 
516 private:
517    virtual bool visit(Instruction *);
518    virtual bool visit(Function *);
519 
520    bool handleRDSV(Instruction *);
521    bool handleWRSV(Instruction *);
522 
523    bool handleEXPORT(Instruction *);
524 
525    bool handleDIV(Instruction *);
526    bool handleSQRT(Instruction *);
527    bool handlePOW(Instruction *);
528 
529    bool handleSET(Instruction *);
530    bool handleSLCT(CmpInstruction *);
531    bool handleSELP(Instruction *);
532 
533    bool handleTEX(TexInstruction *);
534    bool handleTXB(TexInstruction *); // I really
535    bool handleTXL(TexInstruction *); // hate
536    bool handleTXD(TexInstruction *); // these 3
537 
538    bool handleCALL(Instruction *);
539    bool handlePRECONT(Instruction *);
540    bool handleCONT(Instruction *);
541 
542    void checkPredicate(Instruction *);
543 
544 private:
545    const Target *const targ;
546 
547    BuildUtil bld;
548 
549    Value *tid;
550 };
551 
NV50LoweringPreSSA(Program * prog)552 NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
553    targ(prog->getTarget()), tid(NULL)
554 {
555    bld.setProgram(prog);
556 }
557 
558 bool
visit(Function * f)559 NV50LoweringPreSSA::visit(Function *f)
560 {
561    BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
562 
563    if (prog->getType() == Program::TYPE_COMPUTE) {
564       // Add implicit "thread id" argument in $r0 to the function
565       Value *arg = new_LValue(func, FILE_GPR);
566       arg->reg.data.id = 0;
567       f->ins.push_back(arg);
568 
569       bld.setPosition(root, false);
570       tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
571    }
572 
573    return true;
574 }
575 
// put dref before bias/lod, convert the array index to u32 (clamped to 511),
// turn cube arrays into 2D arrays
577 bool
handleTEX(TexInstruction * i)578 NV50LoweringPreSSA::handleTEX(TexInstruction *i)
579 {
580    const int arg = i->tex.target.getArgCount();
581    const int dref = arg;
582    const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
583 
584    // dref comes before bias/lod
585    if (i->tex.target.isShadow())
586       if (i->op == OP_TXB || i->op == OP_TXL)
587          i->swapSources(dref, lod);
588 
589    // array index must be converted to u32
590    if (i->tex.target.isArray()) {
591       Value *layer = i->getSrc(arg - 1);
592       LValue *src = new_LValue(func, FILE_GPR);
593       bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
594       bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
595       i->setSrc(arg - 1, src);
596 
597       if (i->tex.target.isCube()) {
598          // Value *face = layer;
599          Value *x, *y;
600          x = new_LValue(func, FILE_GPR);
601          y = new_LValue(func, FILE_GPR);
602          layer = new_LValue(func, FILE_GPR);
603 
604          i->tex.target = TEX_TARGET_2D_ARRAY;
605 
606          // TODO: use TEXPREP to convert x,y,z,face -> x,y,layer
607          bld.mkMov(x, i->getSrc(0));
608          bld.mkMov(y, i->getSrc(1));
609          bld.mkMov(layer, i->getSrc(3));
610 
611          i->setSrc(0, x);
612          i->setSrc(1, y);
613          i->setSrc(2, layer);
614          i->setSrc(3, i->getSrc(4));
615          i->setSrc(4, NULL);
616       }
617    }
618 
619    // texel offsets are 3 immediate fields in the instruction,
620    // nv50 cannot do textureGatherOffsets
621    assert(i->tex.useOffsets <= 1);
622 
623    return true;
624 }
625 
626 // Bias must be equal for all threads of a quad or lod calculation will fail.
627 //
628 // The lanes of a quad are grouped by the bit in the condition register they
629 // have set, which is selected by differing bias values.
630 // Move the input values for TEX into a new register set for each group and
631 // execute TEX only for a specific group.
632 // We always need to use 4 new registers for the inputs/outputs because the
633 // implicitly calculated derivatives must be correct.
634 //
635 // TODO: move to SSA phase so we can easily determine whether bias is constant
636 bool
handleTXB(TexInstruction * i)637 NV50LoweringPreSSA::handleTXB(TexInstruction *i)
638 {
639    const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
640    int l, d;
641 
642    handleTEX(i);
643    Value *bias = i->getSrc(i->tex.target.getArgCount());
644    if (bias->isUniform())
645       return true;
646 
647    Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
648                                  bld.loadImm(NULL, 1));
649    bld.setPosition(cond, false);
650 
651    for (l = 1; l < 4; ++l) {
652       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
653       Value *bit = bld.getSSA();
654       Value *pred = bld.getScratch(1, FILE_FLAGS);
655       Value *imm = bld.loadImm(NULL, (1 << l));
656       bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
657       bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
658       cond->setSrc(l, bit);
659    }
660    Value *flags = bld.getScratch(1, FILE_FLAGS);
661    bld.setPosition(cond, true);
662    bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
663 
664    Instruction *tex[4];
665    for (l = 0; l < 4; ++l) {
666       (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
667       bld.insert(tex[l]);
668    }
669 
670    Value *res[4][4];
671    for (d = 0; i->defExists(d); ++d)
672       res[0][d] = tex[0]->getDef(d);
673    for (l = 1; l < 4; ++l) {
674       for (d = 0; tex[l]->defExists(d); ++d) {
675          res[l][d] = cloneShallow(func, res[0][d]);
676          bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
677       }
678    }
679 
680    for (d = 0; i->defExists(d); ++d) {
681       Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
682       for (l = 0; l < 4; ++l)
683          dst->setSrc(l, res[l][d]);
684    }
685    delete_Instruction(prog, i);
686    return true;
687 }
688 
689 // LOD must be equal for all threads of a quad.
690 // Unlike with TXB, here we can just diverge since there's no LOD calculation
691 // that would require all 4 threads' sources to be set up properly.
692 bool
handleTXL(TexInstruction * i)693 NV50LoweringPreSSA::handleTXL(TexInstruction *i)
694 {
695    handleTEX(i);
696    Value *lod = i->getSrc(i->tex.target.getArgCount());
697    if (lod->isUniform())
698       return true;
699 
700    BasicBlock *currBB = i->bb;
701    BasicBlock *texiBB = i->bb->splitBefore(i, false);
702    BasicBlock *joinBB = i->bb->splitAfter(i);
703 
704    bld.setPosition(currBB, true);
705    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
706 
707    for (int l = 0; l <= 3; ++l) {
708       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
709       Value *pred = bld.getScratch(1, FILE_FLAGS);
710       bld.setPosition(currBB, true);
711       bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
712       bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
713       currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
714       if (l <= 2) {
715          BasicBlock *laneBB = new BasicBlock(func);
716          currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
717          currBB = laneBB;
718       }
719    }
720    bld.setPosition(joinBB, false);
721    bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
722    return true;
723 }
724 
725 bool
handleTXD(TexInstruction * i)726 NV50LoweringPreSSA::handleTXD(TexInstruction *i)
727 {
728    static const uint8_t qOps[4][2] =
729    {
730       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
731       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
732       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
733       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
734    };
735    Value *def[4][4];
736    Value *crd[3];
737    Instruction *tex;
738    Value *zero = bld.loadImm(bld.getSSA(), 0);
739    int l, c;
740    const int dim = i->tex.target.getDim();
741 
742    handleTEX(i);
743    i->op = OP_TEX; // no need to clone dPdx/dPdy later
744 
745    for (c = 0; c < dim; ++c)
746       crd[c] = bld.getScratch();
747 
748    bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
749    for (l = 0; l < 4; ++l) {
750       // mov coordinates from lane l to all lanes
751       for (c = 0; c < dim; ++c)
752          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
753       // add dPdx from lane l to lanes dx
754       for (c = 0; c < dim; ++c)
755          bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
756       // add dPdy from lane l to lanes dy
757       for (c = 0; c < dim; ++c)
758          bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
759       // texture
760       bld.insert(tex = cloneForward(func, i));
761       for (c = 0; c < dim; ++c)
762          tex->setSrc(c, crd[c]);
763       // save results
764       for (c = 0; i->defExists(c); ++c) {
765          Instruction *mov;
766          def[c][l] = bld.getSSA();
767          mov = bld.mkMov(def[c][l], tex->getDef(c));
768          mov->fixed = 1;
769          mov->lanes = 1 << l;
770       }
771    }
772    bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
773 
774    for (c = 0; i->defExists(c); ++c) {
775       Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
776       for (l = 0; l < 4; ++l)
777          u->setSrc(l, def[c][l]);
778    }
779 
780    i->bb->remove(i);
781    return true;
782 }
783 
784 bool
handleSET(Instruction * i)785 NV50LoweringPreSSA::handleSET(Instruction *i)
786 {
787    if (i->dType == TYPE_F32) {
788       bld.setPosition(i, true);
789       i->dType = TYPE_U32;
790       bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
791       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
792    }
793    return true;
794 }
795 
796 bool
handleSLCT(CmpInstruction * i)797 NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
798 {
799    Value *src0 = bld.getSSA();
800    Value *src1 = bld.getSSA();
801    Value *pred = bld.getScratch(1, FILE_FLAGS);
802 
803    Value *v0 = i->getSrc(0);
804    Value *v1 = i->getSrc(1);
805    // XXX: these probably shouldn't be immediates in the first place ...
806    if (v0->asImm())
807       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
808    if (v1->asImm())
809       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
810 
811    bld.setPosition(i, true);
812    bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
813    bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
814    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
815 
816    bld.setPosition(i, false);
817    i->op = OP_SET;
818    i->setFlagsDef(0, pred);
819    i->dType = TYPE_U8;
820    i->setSrc(0, i->getSrc(2));
821    i->setSrc(2, NULL);
822    i->setSrc(1, bld.loadImm(NULL, 0));
823 
824    return true;
825 }
826 
827 bool
handleSELP(Instruction * i)828 NV50LoweringPreSSA::handleSELP(Instruction *i)
829 {
830    Value *src0 = bld.getSSA();
831    Value *src1 = bld.getSSA();
832 
833    Value *v0 = i->getSrc(0);
834    Value *v1 = i->getSrc(1);
835    if (v0->asImm())
836       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
837    if (v1->asImm())
838       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
839 
840    bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
841    bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
842    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
843    delete_Instruction(prog, i);
844    return true;
845 }
846 
847 bool
handleWRSV(Instruction * i)848 NV50LoweringPreSSA::handleWRSV(Instruction *i)
849 {
850    Symbol *sym = i->getSrc(0)->asSym();
851 
852    // these are all shader outputs, $sreg are not writeable
853    uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
854    if (addr >= 0x400)
855       return false;
856    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
857 
858    bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
859 
860    bld.getBB()->remove(i);
861    return true;
862 }
863 
864 bool
handleCALL(Instruction * i)865 NV50LoweringPreSSA::handleCALL(Instruction *i)
866 {
867    if (prog->getType() == Program::TYPE_COMPUTE) {
868       // Add implicit "thread id" argument in $r0 to the function
869       i->setSrc(i->srcCount(), tid);
870    }
871    return true;
872 }
873 
874 bool
handlePRECONT(Instruction * i)875 NV50LoweringPreSSA::handlePRECONT(Instruction *i)
876 {
877    delete_Instruction(prog, i);
878    return true;
879 }
880 
881 bool
handleCONT(Instruction * i)882 NV50LoweringPreSSA::handleCONT(Instruction *i)
883 {
884    i->op = OP_BRA;
885    return true;
886 }
887 
888 bool
handleRDSV(Instruction * i)889 NV50LoweringPreSSA::handleRDSV(Instruction *i)
890 {
891    Symbol *sym = i->getSrc(0)->asSym();
892    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
893    Value *def = i->getDef(0);
894    SVSemantic sv = sym->reg.data.sv.sv;
895    int idx = sym->reg.data.sv.index;
896 
897    if (addr >= 0x400) // mov $sreg
898       return true;
899 
900    switch (sv) {
901    case SV_POSITION:
902       assert(prog->getType() == Program::TYPE_FRAGMENT);
903       bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
904       break;
905    case SV_FACE:
906       bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
907       if (i->dType == TYPE_F32) {
908          bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
909          bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
910       }
911       break;
912    case SV_NCTAID:
913    case SV_CTAID:
914    case SV_NTID:
915       if ((sv == SV_NCTAID && idx >= 2) ||
916           (sv == SV_NTID && idx >= 3)) {
917          bld.mkMov(def, bld.mkImm(1));
918       } else if (sv == SV_CTAID && idx >= 2) {
919          bld.mkMov(def, bld.mkImm(0));
920       } else {
921          Value *x = bld.getSSA(2);
922          bld.mkOp1(OP_LOAD, TYPE_U16, x,
923                    bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
924          bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
925       }
926       break;
927    case SV_TID:
928       if (idx == 0) {
929          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
930       } else if (idx == 1) {
931          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
932          bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
933       } else if (idx == 2) {
934          bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
935       } else {
936          bld.mkMov(def, bld.mkImm(0));
937       }
938       break;
939    default:
940       bld.mkFetch(i->getDef(0), i->dType,
941                   FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
942       break;
943    }
944    bld.getBB()->remove(i);
945    return true;
946 }
947 
948 bool
handleDIV(Instruction * i)949 NV50LoweringPreSSA::handleDIV(Instruction *i)
950 {
951    if (!isFloatType(i->dType))
952       return true;
953    bld.setPosition(i, false);
954    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
955    i->op = OP_MUL;
956    i->setSrc(1, rcp->getDef(0));
957    return true;
958 }
959 
960 bool
handleSQRT(Instruction * i)961 NV50LoweringPreSSA::handleSQRT(Instruction *i)
962 {
963    Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
964                                 bld.getSSA(), i->getSrc(0));
965    i->op = OP_MUL;
966    i->setSrc(1, rsq->getDef(0));
967 
968    return true;
969 }
970 
971 bool
handlePOW(Instruction * i)972 NV50LoweringPreSSA::handlePOW(Instruction *i)
973 {
974    LValue *val = bld.getScratch();
975 
976    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
977    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
978    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
979 
980    i->op = OP_EX2;
981    i->setSrc(0, val);
982    i->setSrc(1, NULL);
983 
984    return true;
985 }
986 
987 bool
handleEXPORT(Instruction * i)988 NV50LoweringPreSSA::handleEXPORT(Instruction *i)
989 {
990    if (prog->getType() == Program::TYPE_FRAGMENT) {
991       if (i->getIndirect(0, 0)) {
992          // TODO: redirect to l[] here, load to GPRs at exit
993          return false;
994       } else {
995          int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
996 
997          i->op = OP_MOV;
998          i->subOp = NV50_IR_SUBOP_MOV_FINAL;
999          i->src(0).set(i->src(1));
1000          i->setSrc(1, NULL);
1001          i->setDef(0, new_LValue(func, FILE_GPR));
1002          i->getDef(0)->reg.data.id = id;
1003 
1004          prog->maxGPR = MAX2(prog->maxGPR, id);
1005       }
1006    }
1007    return true;
1008 }
1009 
1010 // Set flags according to predicate and make the instruction read $cX.
1011 void
checkPredicate(Instruction * insn)1012 NV50LoweringPreSSA::checkPredicate(Instruction *insn)
1013 {
1014    Value *pred = insn->getPredicate();
1015    Value *cdst;
1016 
1017    if (!pred || pred->reg.file == FILE_FLAGS)
1018       return;
1019    cdst = bld.getSSA(1, FILE_FLAGS);
1020 
1021    bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, cdst, bld.loadImm(NULL, 0), pred);
1022 
1023    insn->setPredicate(insn->cc, cdst);
1024 }
1025 
1026 //
1027 // - add quadop dance for texturing
1028 // - put FP outputs in GPRs
1029 // - convert instruction sequences
1030 //
1031 bool
visit(Instruction * i)1032 NV50LoweringPreSSA::visit(Instruction *i)
1033 {
1034    bld.setPosition(i, false);
1035 
1036    if (i->cc != CC_ALWAYS)
1037       checkPredicate(i);
1038 
1039    switch (i->op) {
1040    case OP_TEX:
1041    case OP_TXF:
1042    case OP_TXG:
1043       return handleTEX(i->asTex());
1044    case OP_TXB:
1045       return handleTXB(i->asTex());
1046    case OP_TXL:
1047       return handleTXL(i->asTex());
1048    case OP_TXD:
1049       return handleTXD(i->asTex());
1050    case OP_EX2:
1051       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1052       i->setSrc(0, i->getDef(0));
1053       break;
1054    case OP_SET:
1055       return handleSET(i);
1056    case OP_SLCT:
1057       return handleSLCT(i->asCmp());
1058    case OP_SELP:
1059       return handleSELP(i);
1060    case OP_POW:
1061       return handlePOW(i);
1062    case OP_DIV:
1063       return handleDIV(i);
1064    case OP_SQRT:
1065       return handleSQRT(i);
1066    case OP_EXPORT:
1067       return handleEXPORT(i);
1068    case OP_RDSV:
1069       return handleRDSV(i);
1070    case OP_WRSV:
1071       return handleWRSV(i);
1072    case OP_CALL:
1073       return handleCALL(i);
1074    case OP_PRECONT:
1075       return handlePRECONT(i);
1076    case OP_CONT:
1077       return handleCONT(i);
1078    default:
1079       break;
1080    }
1081    return true;
1082 }
1083 
1084 bool
runLegalizePass(Program * prog,CGStage stage) const1085 TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
1086 {
1087    bool ret = false;
1088 
1089    if (stage == CG_STAGE_PRE_SSA) {
1090       NV50LoweringPreSSA pass(prog);
1091       ret = pass.run(prog, false, true);
1092    } else
1093    if (stage == CG_STAGE_SSA) {
1094       if (!prog->targetPriv)
1095          prog->targetPriv = new std::list<Instruction *>();
1096       NV50LegalizeSSA pass(prog);
1097       ret = pass.run(prog, false, true);
1098    } else
1099    if (stage == CG_STAGE_POST_RA) {
1100       NV50LegalizePostRA pass;
1101       ret = pass.run(prog, false, true);
1102       if (prog->targetPriv)
1103          delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
1104    }
1105    return ret;
1106 }
1107 
1108 } // namespace nv50_ir
1109