1 /*
2  * Copyright 2011 Christoph Bumiller
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_build_util.h"
25 
26 #include "codegen/nv50_ir_target_nvc0.h"
27 #include "codegen/nv50_ir_lowering_nvc0.h"
28 
29 #include <limits>
30 
31 namespace nv50_ir {
32 
// Two-bit operation selectors; QUADOP() packs four of them into one byte.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Builds a packed quad-operation mask from four QOP_ suffixes (ADD, SUBR,
// SUB, MOV2). Arguments are listed in the order
//             UL UR LL LR
// and land in bits 7:6, 5:4, 3:2 and 1:0 respectively.
#define QUADOP(q, r, s, t) \
   ((QOP_##q << 6) | (QOP_##r << 4) | (QOP_##s << 2) | (QOP_##t << 0))
42 
43 void
handleDIV(Instruction * i)44 NVC0LegalizeSSA::handleDIV(Instruction *i)
45 {
46    FlowInstruction *call;
47    int builtin;
48 
49    bld.setPosition(i, false);
50 
51    // Generate movs to the input regs for the call we want to generate
52    for (int s = 0; i->srcExists(s); ++s) {
53       Instruction *ld = i->getSrc(s)->getInsn();
54       assert(ld->getSrc(0) != NULL);
55       // check if we are moving an immediate, propagate it in that case
56       if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV) ||
57             !(ld->src(0).getFile() == FILE_IMMEDIATE))
58          bld.mkMovToReg(s, i->getSrc(s));
59       else {
60          bld.mkMovToReg(s, ld->getSrc(0));
61          // Clear the src, to make code elimination possible here before we
62          // delete the instruction i later
63          i->setSrc(s, NULL);
64          if (ld->isDead())
65             delete_Instruction(prog, ld);
66       }
67    }
68 
69    switch (i->dType) {
70    case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
71    case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
72    default:
73       return;
74    }
75    call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
76    bld.mkMovFromReg(i->getDef(0), i->op == OP_DIV ? 0 : 1);
77    bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
78    bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
79 
80    call->fixed = 1;
81    call->absolute = call->builtin = 1;
82    call->target.builtin = builtin;
83    delete_Instruction(prog, i);
84 }
85 
86 void
handleRCPRSQ(Instruction * i)87 NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
88 {
89    assert(i->dType == TYPE_F64);
90    // There are instructions that will compute the high 32 bits of the 64-bit
91    // float. We will just stick 0 in the bottom 32 bits.
92 
93    bld.setPosition(i, false);
94 
95    // 1. Take the source and it up.
96    Value *src[2], *dst[2], *def = i->getDef(0);
97    bld.mkSplit(src, 4, i->getSrc(0));
98 
99    // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
100    dst[0] = bld.loadImm(NULL, 0);
101    dst[1] = bld.getSSA();
102 
103    // 3. The new version of the instruction takes the high 32 bits of the
104    // source and outputs the high 32 bits of the destination.
105    i->setSrc(0, src[1]);
106    i->setDef(0, dst[1]);
107    i->setType(TYPE_F32);
108    i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;
109 
110    // 4. Recombine the two dst pieces back into the original destination.
111    bld.setPosition(i, true);
112    bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
113 }
114 
115 void
handleFTZ(Instruction * i)116 NVC0LegalizeSSA::handleFTZ(Instruction *i)
117 {
118    // Only want to flush float inputs
119    assert(i->sType == TYPE_F32);
120 
121    // If we're already flushing denorms (and NaN's) to zero, no need for this.
122    if (i->dnz)
123       return;
124 
125    // Only certain classes of operations can flush
126    OpClass cls = prog->getTarget()->getOpClass(i->op);
127    if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
128        cls != OPCLASS_CONVERT)
129       return;
130 
131    i->ftz = true;
132 }
133 
134 void
handleTEXLOD(TexInstruction * i)135 NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
136 {
137    if (i->tex.levelZero)
138       return;
139 
140    ImmediateValue lod;
141 
142    // The LOD argument comes right after the coordinates (before depth bias,
143    // offsets, etc).
144    int arg = i->tex.target.getArgCount();
145 
146    // SM30+ stores the indirect handle as a separate arg, which comes before
147    // the LOD.
148    if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET &&
149        i->tex.rIndirectSrc >= 0)
150       arg++;
151    // SM20 stores indirect handle combined with array coordinate
152    if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET &&
153        !i->tex.target.isArray() &&
154        i->tex.rIndirectSrc >= 0)
155       arg++;
156 
157    if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0))
158       return;
159 
160    if (i->op == OP_TXL)
161       i->op = OP_TEX;
162    i->tex.levelZero = true;
163    i->moveSources(arg + 1, -1);
164 }
165 
166 void
handleShift(Instruction * lo)167 NVC0LegalizeSSA::handleShift(Instruction *lo)
168 {
169    Value *shift = lo->getSrc(1);
170    Value *dst64 = lo->getDef(0);
171    Value *src[2], *dst[2];
172    operation op = lo->op;
173 
174    bld.setPosition(lo, false);
175 
176    bld.mkSplit(src, 4, lo->getSrc(0));
177 
178    // SM30 and prior don't have the fancy new SHF.L/R ops. So the logic has to
179    // be completely emulated. For SM35+, we can use the more directed SHF
180    // operations.
181    if (prog->getTarget()->getChipset() < NVISA_GK20A_CHIPSET) {
182       // The strategy here is to handle shifts >= 32 and less than 32 as
183       // separate parts.
184       //
185       // For SHL:
186       // If the shift is <= 32, then
187       //   (HI,LO) << x = (HI << x | (LO >> (32 - x)), LO << x)
188       // If the shift is > 32, then
189       //   (HI,LO) << x = (LO << (x - 32), 0)
190       //
191       // For SHR:
192       // If the shift is <= 32, then
193       //   (HI,LO) >> x = (HI >> x, (HI << (32 - x)) | LO >> x)
194       // If the shift is > 32, then
195       //   (HI,LO) >> x = (0, HI >> (x - 32))
196       //
197       // Note that on NVIDIA hardware, a shift > 32 yields a 0 value, which we
198       // can use to our advantage. Also note the structural similarities
199       // between the right/left cases. The main difference is swapping hi/lo
200       // on input and output.
201 
202       Value *x32_minus_shift, *pred, *hi1, *hi2;
203       DataType type = isSignedIntType(lo->dType) ? TYPE_S32 : TYPE_U32;
204       operation antiop = op == OP_SHR ? OP_SHL : OP_SHR;
205       if (op == OP_SHR)
206          std::swap(src[0], src[1]);
207       bld.mkOp2(OP_ADD, TYPE_U32, (x32_minus_shift = bld.getSSA()), shift, bld.mkImm(0x20))
208          ->src(0).mod = Modifier(NV50_IR_MOD_NEG);
209       bld.mkCmp(OP_SET, CC_LE, TYPE_U8, (pred = bld.getSSA(1, FILE_PREDICATE)),
210                 TYPE_U32, shift, bld.mkImm(32));
211       // Compute HI (shift <= 32)
212       bld.mkOp2(OP_OR, TYPE_U32, (hi1 = bld.getSSA()),
213                 bld.mkOp2v(op, TYPE_U32, bld.getSSA(), src[1], shift),
214                 bld.mkOp2v(antiop, TYPE_U32, bld.getSSA(), src[0], x32_minus_shift))
215          ->setPredicate(CC_P, pred);
216       // Compute LO (all shift values)
217       bld.mkOp2(op, type, (dst[0] = bld.getSSA()), src[0], shift);
218       // Compute HI (shift > 32)
219       bld.mkOp2(op, type, (hi2 = bld.getSSA()), src[0],
220                 bld.mkOp1v(OP_NEG, TYPE_S32, bld.getSSA(), x32_minus_shift))
221          ->setPredicate(CC_NOT_P, pred);
222       bld.mkOp2(OP_UNION, TYPE_U32, (dst[1] = bld.getSSA()), hi1, hi2);
223       if (op == OP_SHR)
224          std::swap(dst[0], dst[1]);
225       bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
226       delete_Instruction(prog, lo);
227       return;
228    }
229 
230    Instruction *hi = new_Instruction(func, op, TYPE_U32);
231    lo->bb->insertAfter(lo, hi);
232 
233    hi->sType = lo->sType;
234    lo->dType = TYPE_U32;
235 
236    hi->setDef(0, (dst[1] = bld.getSSA()));
237    if (lo->op == OP_SHR)
238       hi->subOp |= NV50_IR_SUBOP_SHIFT_HIGH;
239    lo->setDef(0, (dst[0] = bld.getSSA()));
240 
241    bld.setPosition(hi, true);
242 
243    if (lo->op == OP_SHL)
244       std::swap(hi, lo);
245 
246    hi->setSrc(0, new_ImmediateValue(prog, 0u));
247    hi->setSrc(1, shift);
248    hi->setSrc(2, lo->op == OP_SHL ? src[0] : src[1]);
249 
250    lo->setSrc(0, src[0]);
251    lo->setSrc(1, shift);
252    lo->setSrc(2, src[1]);
253 
254    bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
255 }
256 
257 void
handleSET(CmpInstruction * cmp)258 NVC0LegalizeSSA::handleSET(CmpInstruction *cmp)
259 {
260    DataType hTy = cmp->sType == TYPE_S64 ? TYPE_S32 : TYPE_U32;
261    Value *carry;
262    Value *src0[2], *src1[2];
263    bld.setPosition(cmp, false);
264 
265    bld.mkSplit(src0, 4, cmp->getSrc(0));
266    bld.mkSplit(src1, 4, cmp->getSrc(1));
267    bld.mkOp2(OP_SUB, hTy, NULL, src0[0], src1[0])
268       ->setFlagsDef(0, (carry = bld.getSSA(1, FILE_FLAGS)));
269    cmp->setFlagsSrc(cmp->srcCount(), carry);
270    cmp->setSrc(0, src0[1]);
271    cmp->setSrc(1, src1[1]);
272    cmp->sType = hTy;
273 }
274 
275 bool
visit(Function * fn)276 NVC0LegalizeSSA::visit(Function *fn)
277 {
278    bld.setProgram(fn->getProgram());
279    return true;
280 }
281 
282 bool
visit(BasicBlock * bb)283 NVC0LegalizeSSA::visit(BasicBlock *bb)
284 {
285    Instruction *next;
286    for (Instruction *i = bb->getEntry(); i; i = next) {
287       next = i->next;
288 
289       if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE)
290          handleFTZ(i);
291 
292       switch (i->op) {
293       case OP_DIV:
294       case OP_MOD:
295          if (i->sType != TYPE_F32)
296             handleDIV(i);
297          break;
298       case OP_RCP:
299       case OP_RSQ:
300          if (i->dType == TYPE_F64)
301             handleRCPRSQ(i);
302          break;
303       case OP_TXL:
304       case OP_TXF:
305          handleTEXLOD(i->asTex());
306          break;
307       case OP_SHR:
308       case OP_SHL:
309          if (typeSizeof(i->sType) == 8)
310             handleShift(i);
311          break;
312       case OP_SET:
313       case OP_SET_AND:
314       case OP_SET_OR:
315       case OP_SET_XOR:
316          if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64)
317             handleSET(i->asCmp());
318          break;
319       default:
320          break;
321       }
322    }
323    return true;
324 }
325 
NVC0LegalizePostRA(const Program * prog)326 NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
327    : rZero(NULL),
328      carry(NULL),
329      pOne(NULL),
330      needTexBar(prog->getTarget()->getChipset() >= 0xe0 &&
331                 prog->getTarget()->getChipset() < 0x110)
332 {
333 }
334 
335 bool
insnDominatedBy(const Instruction * later,const Instruction * early) const336 NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
337                                     const Instruction *early) const
338 {
339    if (early->bb == later->bb)
340       return early->serial < later->serial;
341    return later->bb->dominatedBy(early->bb);
342 }
343 
344 void
addTexUse(std::list<TexUse> & uses,Instruction * usei,const Instruction * texi)345 NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
346                               Instruction *usei, const Instruction *texi)
347 {
348    bool add = true;
349    bool dominated = insnDominatedBy(usei, texi);
350    // Uses before the tex have to all be included. Just because an earlier
351    // instruction dominates another instruction doesn't mean that there's no
352    // way to get from the tex to the later instruction. For example you could
353    // have nested loops, with the tex in the inner loop, and uses before it in
354    // both loops - even though the outer loop's instruction would dominate the
355    // inner's, we still want a texbar before the inner loop's instruction.
356    //
357    // However we can still use the eliding logic between uses dominated by the
358    // tex instruction, as that is unambiguously correct.
359    if (dominated) {
360       for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) {
361          if (it->after) {
362             if (insnDominatedBy(usei, it->insn)) {
363                add = false;
364                break;
365             }
366             if (insnDominatedBy(it->insn, usei)) {
367                it = uses.erase(it);
368                continue;
369             }
370          }
371          ++it;
372       }
373    }
374    if (add)
375       uses.push_back(TexUse(usei, texi, dominated));
376 }
377 
378 // While it might be tempting to use the an algorithm that just looks at tex
379 // uses, not all texture results are guaranteed to be used on all paths. In
380 // the case where along some control flow path a texture result is never used,
381 // we might reuse that register for something else, creating a
382 // write-after-write hazard. So we have to manually look through all
383 // instructions looking for ones that reference the registers in question.
384 void
findFirstUses(Instruction * texi,std::list<TexUse> & uses)385 NVC0LegalizePostRA::findFirstUses(
386    Instruction *texi, std::list<TexUse> &uses)
387 {
388    int minGPR = texi->def(0).rep()->reg.data.id;
389    int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;
390 
391    unordered_set<const BasicBlock *> visited;
392    findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
393 }
394 
395 void
findFirstUsesBB(int minGPR,int maxGPR,Instruction * start,const Instruction * texi,std::list<TexUse> & uses,unordered_set<const BasicBlock * > & visited)396 NVC0LegalizePostRA::findFirstUsesBB(
397    int minGPR, int maxGPR, Instruction *start,
398    const Instruction *texi, std::list<TexUse> &uses,
399    unordered_set<const BasicBlock *> &visited)
400 {
401    const BasicBlock *bb = start->bb;
402 
403    // We don't process the whole bb the first time around. This is correct,
404    // however we might be in a loop and hit this BB again, and need to process
405    // the full thing. So only mark a bb as visited if we processed it from the
406    // beginning.
407    if (start == bb->getEntry()) {
408       if (visited.find(bb) != visited.end())
409          return;
410       visited.insert(bb);
411    }
412 
413    for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
414       if (insn->isNop())
415          continue;
416 
417       for (int d = 0; insn->defExists(d); ++d) {
418          const Value *def = insn->def(d).rep();
419          if (insn->def(d).getFile() != FILE_GPR ||
420              def->reg.data.id + def->reg.size / 4 - 1 < minGPR ||
421              def->reg.data.id > maxGPR)
422             continue;
423          addTexUse(uses, insn, texi);
424          return;
425       }
426 
427       for (int s = 0; insn->srcExists(s); ++s) {
428          const Value *src = insn->src(s).rep();
429          if (insn->src(s).getFile() != FILE_GPR ||
430              src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
431              src->reg.data.id > maxGPR)
432             continue;
433          addTexUse(uses, insn, texi);
434          return;
435       }
436    }
437 
438    for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
439       findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
440                       texi, uses, visited);
441    }
442 }
443 
444 // Texture barriers:
445 // This pass is a bit long and ugly and can probably be optimized.
446 //
447 // 1. obtain a list of TEXes and their outputs' first use(s)
448 // 2. calculate the barrier level of each first use (minimal number of TEXes,
449 //    over all paths, between the TEX and the use in question)
450 // 3. for each barrier, if all paths from the source TEX to that barrier
451 //    contain a barrier of lesser level, it can be culled
452 bool
insertTextureBarriers(Function * fn)453 NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
454 {
455    std::list<TexUse> *uses;
456    std::vector<Instruction *> texes;
457    std::vector<int> bbFirstTex;
458    std::vector<int> bbFirstUse;
459    std::vector<int> texCounts;
460    std::vector<TexUse> useVec;
461    ArrayList insns;
462 
463    fn->orderInstructions(insns);
464 
465    texCounts.resize(fn->allBBlocks.getSize(), 0);
466    bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
467    bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());
468 
469    // tag BB CFG nodes by their id for later
470    for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
471       BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
472       if (bb)
473          bb->cfg.tag = bb->getId();
474    }
475 
476    // gather the first uses for each TEX
477    for (int i = 0; i < insns.getSize(); ++i) {
478       Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
479       if (isTextureOp(tex->op)) {
480          texes.push_back(tex);
481          if (!texCounts.at(tex->bb->getId()))
482             bbFirstTex[tex->bb->getId()] = texes.size() - 1;
483          texCounts[tex->bb->getId()]++;
484       }
485    }
486    insns.clear();
487    if (texes.empty())
488       return false;
489    uses = new std::list<TexUse>[texes.size()];
490    if (!uses)
491       return false;
492    for (size_t i = 0; i < texes.size(); ++i) {
493       findFirstUses(texes[i], uses[i]);
494    }
495 
496    // determine the barrier level at each use
497    for (size_t i = 0; i < texes.size(); ++i) {
498       for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
499            ++u) {
500          BasicBlock *tb = texes[i]->bb;
501          BasicBlock *ub = u->insn->bb;
502          if (tb == ub) {
503             u->level = 0;
504             for (size_t j = i + 1; j < texes.size() &&
505                     texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
506                  ++j)
507                u->level++;
508          } else {
509             u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
510                                                       &ub->cfg, texCounts);
511             if (u->level < 0) {
512                WARN("Failed to find path TEX -> TEXBAR\n");
513                u->level = 0;
514                continue;
515             }
516             // this counted all TEXes in the origin block, correct that
517             u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
518             // and did not count the TEXes in the destination block, add those
519             for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
520                     texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
521                  ++j)
522                u->level++;
523          }
524          assert(u->level >= 0);
525          useVec.push_back(*u);
526       }
527    }
528    delete[] uses;
529 
530    // insert the barriers
531    for (size_t i = 0; i < useVec.size(); ++i) {
532       Instruction *prev = useVec[i].insn->prev;
533       if (useVec[i].level < 0)
534          continue;
535       if (prev && prev->op == OP_TEXBAR) {
536          if (prev->subOp > useVec[i].level)
537             prev->subOp = useVec[i].level;
538          prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
539       } else {
540          Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
541          bar->fixed = 1;
542          bar->subOp = useVec[i].level;
543          // make use explicit to ease latency calculation
544          bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
545          useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
546       }
547    }
548 
549    if (fn->getProgram()->optLevel < 3)
550       return true;
551 
552    std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
553 
554    limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
555    limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
556    limitS.resize(fn->allBBlocks.getSize());
557 
558    // cull unneeded barriers (should do that earlier, but for simplicity)
559    IteratorRef bi = fn->cfg.iteratorCFG();
560    // first calculate min/max outstanding TEXes for each BB
561    for (bi->reset(); !bi->end(); bi->next()) {
562       Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
563       BasicBlock *bb = BasicBlock::get(n);
564       int min = 0;
565       int max = std::numeric_limits<int>::max();
566       for (Instruction *i = bb->getFirst(); i; i = i->next) {
567          if (isTextureOp(i->op)) {
568             min++;
569             if (max < std::numeric_limits<int>::max())
570                max++;
571          } else
572          if (i->op == OP_TEXBAR) {
573             min = MIN2(min, i->subOp);
574             max = MIN2(max, i->subOp);
575          }
576       }
577       // limits when looking at an isolated block
578       limitS[bb->getId()].min = min;
579       limitS[bb->getId()].max = max;
580    }
581    // propagate the min/max values
582    for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
583       for (bi->reset(); !bi->end(); bi->next()) {
584          Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
585          BasicBlock *bb = BasicBlock::get(n);
586          const int bbId = bb->getId();
587          for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
588             BasicBlock *in = BasicBlock::get(ei.getNode());
589             const int inId = in->getId();
590             limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
591             limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
592          }
593          // I just hope this is correct ...
594          if (limitS[bbId].max == std::numeric_limits<int>::max()) {
595             // no barrier
596             limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
597             limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
598          } else {
599             // block contained a barrier
600             limitB[bbId].min = MIN2(limitS[bbId].max,
601                                     limitT[bbId].min + limitS[bbId].min);
602             limitB[bbId].max = MIN2(limitS[bbId].max,
603                                     limitT[bbId].max + limitS[bbId].min);
604          }
605       }
606    }
607    // finally delete unnecessary barriers
608    for (bi->reset(); !bi->end(); bi->next()) {
609       Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
610       BasicBlock *bb = BasicBlock::get(n);
611       Instruction *prev = NULL;
612       Instruction *next;
613       int max = limitT[bb->getId()].max;
614       for (Instruction *i = bb->getFirst(); i; i = next) {
615          next = i->next;
616          if (i->op == OP_TEXBAR) {
617             if (i->subOp >= max) {
618                delete_Instruction(prog, i);
619                i = NULL;
620             } else {
621                max = i->subOp;
622                if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
623                   delete_Instruction(prog, prev);
624                   prev = NULL;
625                }
626             }
627          } else
628          if (isTextureOp(i->op)) {
629             max++;
630          }
631          if (i && !i->isNop())
632             prev = i;
633       }
634    }
635    return true;
636 }
637 
638 bool
visit(Function * fn)639 NVC0LegalizePostRA::visit(Function *fn)
640 {
641    if (needTexBar)
642       insertTextureBarriers(fn);
643 
644    rZero = new_LValue(fn, FILE_GPR);
645    pOne = new_LValue(fn, FILE_PREDICATE);
646    carry = new_LValue(fn, FILE_FLAGS);
647 
648    rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63;
649    carry->reg.data.id = 0;
650    pOne->reg.data.id = 7;
651 
652    return true;
653 }
654 
655 void
replaceZero(Instruction * i)656 NVC0LegalizePostRA::replaceZero(Instruction *i)
657 {
658    for (int s = 0; i->srcExists(s); ++s) {
659       if (s == 2 && i->op == OP_SUCLAMP)
660          continue;
661       if (s == 1 && i->op == OP_SHLADD)
662          continue;
663       ImmediateValue *imm = i->getSrc(s)->asImm();
664       if (imm) {
665          if (i->op == OP_SELP && s == 2) {
666             i->setSrc(s, pOne);
667             if (imm->reg.data.u64 == 0)
668                i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT);
669          } else if (imm->reg.data.u64 == 0) {
670             i->setSrc(s, rZero);
671          }
672       }
673    }
674 }
675 
676 // replace CONT with BRA for single unconditional continue
677 bool
tryReplaceContWithBra(BasicBlock * bb)678 NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
679 {
680    if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
681       return false;
682    Graph::EdgeIterator ei = bb->cfg.incident();
683    if (ei.getType() != Graph::Edge::BACK)
684       ei.next();
685    if (ei.getType() != Graph::Edge::BACK)
686       return false;
687    BasicBlock *contBB = BasicBlock::get(ei.getNode());
688 
689    if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
690        contBB->getExit()->getPredicate())
691       return false;
692    contBB->getExit()->op = OP_BRA;
693    bb->remove(bb->getEntry()); // delete PRECONT
694 
695    ei.next();
696    assert(ei.end() || ei.getType() != Graph::Edge::BACK);
697    return true;
698 }
699 
700 // replace branches to join blocks with join ops
701 void
propagateJoin(BasicBlock * bb)702 NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
703 {
704    if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
705       return;
706    for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
707       BasicBlock *in = BasicBlock::get(ei.getNode());
708       Instruction *exit = in->getExit();
709       if (!exit) {
710          in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
711          // there should always be a terminator instruction
712          WARN("inserted missing terminator in BB:%i\n", in->getId());
713       } else
714       if (exit->op == OP_BRA) {
715          exit->op = OP_JOIN;
716          exit->asFlow()->limit = 1; // must-not-propagate marker
717       }
718    }
719    bb->remove(bb->getEntry());
720 }
721 
722 bool
visit(BasicBlock * bb)723 NVC0LegalizePostRA::visit(BasicBlock *bb)
724 {
725    Instruction *i, *next;
726 
727    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
728    for (i = bb->getFirst(); i; i = next) {
729       next = i->next;
730       if (i->op == OP_EMIT || i->op == OP_RESTART) {
731          if (!i->getDef(0)->refCount())
732             i->setDef(0, NULL);
733          if (i->src(0).getFile() == FILE_IMMEDIATE)
734             i->setSrc(0, rZero); // initial value must be 0
735          replaceZero(i);
736       } else
737       if (i->isNop()) {
738          bb->remove(i);
739       } else
740       if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
741           prog->getType() != Program::TYPE_COMPUTE) {
742          // It seems like barriers are never required for tessellation since
743          // the warp size is 32, and there are always at most 32 tcs threads.
744          bb->remove(i);
745       } else
746       if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
747          int offset = i->src(0).get()->reg.data.offset;
748          if (abs(offset) >= 0x10000)
749             i->src(0).get()->reg.fileIndex += offset >> 16;
750          i->src(0).get()->reg.data.offset = (int)(short)offset;
751       } else {
752          // TODO: Move this to before register allocation for operations that
753          // need the $c register !
754          if (typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) {
755             Instruction *hi;
756             hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
757             if (hi)
758                next = hi;
759          }
760 
761          if (i->op != OP_MOV && i->op != OP_PFETCH)
762             replaceZero(i);
763       }
764    }
765    if (!bb->getEntry())
766       return true;
767 
768    if (!tryReplaceContWithBra(bb))
769       propagateJoin(bb);
770 
771    return true;
772 }
773 
NVC0LoweringPass(Program * prog)774 NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
775 {
776    bld.setProgram(prog);
777 }
778 
779 bool
visit(Function * fn)780 NVC0LoweringPass::visit(Function *fn)
781 {
782    if (prog->getType() == Program::TYPE_GEOMETRY) {
783       assert(!strncmp(fn->getName(), "MAIN", 4));
784       // TODO: when we generate actual functions pass this value along somehow
785       bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
786       gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
787       if (fn->cfgExit) {
788          bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
789          bld.mkMovToReg(0, gpEmitAddress);
790       }
791    }
792    return true;
793 }
794 
795 bool
visit(BasicBlock * bb)796 NVC0LoweringPass::visit(BasicBlock *bb)
797 {
798    return true;
799 }
800 
801 inline Value *
loadTexHandle(Value * ptr,unsigned int slot)802 NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
803 {
804    uint8_t b = prog->driver->io.auxCBSlot;
805    uint32_t off = prog->driver->io.texBindBase + slot * 4;
806 
807    if (ptr)
808       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2));
809 
810    return bld.
811       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
812 }
813 
// move array source to first slot, convert to u16, add indirections
//
// Rearranges the sources of a texture instruction into the operand order
// summarized in the per-generation table inside the function. In order, it:
//  1. normalizes cube coordinates when no explicit derivatives are attached,
//  2. resolves the texture/sampler binding or handle (GK104+), or builds the
//     combined tic/tsc/array leading source (older chipsets),
//  3. converts the array index to u16 and moves it into place,
//  4. moves an indirect handle to its final slot,
//  5. packs texel offsets into one or two sources,
//  6. pads the source list on GK104+ so that 5 or 6 sources become 7.
// Always returns true.
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   // dim: number of coordinate components (cube targets count one extra).
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();
   // lyr: source index of the array layer; it is the last argument, or the
   // second to last one for multisample targets.
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
   const int chipset = prog->getTarget()->getChipset();

   /* Only normalize in the non-explicit derivatives case. For explicit
    * derivatives, this is handled in handleManualTXD.
    */
   if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
      Value *src[3], *val;
      int c;
      // val = 1 / max(|x|, |y|, |z|); every coordinate is scaled by it.
      for (c = 0; c < 3; ++c)
         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
      val = bld.getScratch();
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c) {
         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
                                 i->getSrc(c), val));
      }
   }

   // Arguments to the TEX instruction are a little insane. Even though the
   // encoding is identical between SM20 and SM30, the arguments mean
   // different things between Fermi and Kepler+. A lot of arguments are
   // optional based on flags passed to the instruction. This summarizes the
   // order of things.
   //
   // Fermi:
   //  array/indirect
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets:
   //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
   //    - other: 4 bits each, single reg
   //
   // Kepler+:
   //  indirect handle
   //  array (+ offsets for txd in upper 16 bits)
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets (same as fermi, except txd which takes it with array)
   //
   // Maxwell (tex):
   //  array
   //  coords
   //  indirect handle
   //  sample
   //  lod bias
   //  depth compare
   //  offsets
   //
   // Maxwell (txd):
   //  indirect handle
   //  coords
   //  array + offsets
   //  derivatives

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         // XXX this ignores tsc, and assumes a 1:1 mapping
         assert(i->tex.rIndirectSrc >= 0);
         if (!i->tex.bindless) {
            // Not bindless: fetch the handle from the aux constant buffer,
            // indexed by the indirect value, and use it as the new indirect.
            // NOTE(review): 0xff / 0x1f look like "unused" markers for the
            // r/s fields - confirm against the emitter.
            Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r);
            i->tex.r = 0xff;
            i->tex.s = 0x1f;
            i->setIndirectR(hnd);
         }
         i->setIndirectS(NULL);
      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
         // Direct binding: rebase r onto the driver's binding table
         // (fbtexBindBase when r is the 0xffff sentinel, texBindBase
         // otherwise).
         if (i->tex.r == 0xffff)
            i->tex.r = prog->driver->io.fbtexBindBase / 4;
         else
            i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s  = 0; // only a single cX[] value possible here
      } else {
         // Distinct texture and sampler slots: load both handles and merge
         // them with INSBF into one value that becomes the indirect source.
         Value *hnd = bld.getScratch();
         Value *rHnd = loadTexHandle(NULL, i->tex.r);
         Value *sHnd = loadTexHandle(NULL, i->tex.s);

         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);

         i->tex.r = 0; // not used for indirect tex
         i->tex.s = 0;
         i->setIndirectR(hnd);
      }
      if (i->tex.target.isArray()) {
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(lyr);
         // TXF layers are converted from u32 with saturation, all other ops
         // convert from f32 without it.
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
            // Shift the coordinates up by one and put the layer in slot 0.
            for (int s = dim; s >= 1; --s)
               i->setSrc(s, i->getSrc(s - 1));
            i->setSrc(0, layer);
         } else {
            // TXD on GM107+: the layer stays after the coordinates.
            i->setSrc(dim, layer);
         }
      }
      // Move the indirect reference to the first place
      if (i->tex.rIndirectSrc >= 0 && (
                i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(0, 1);
         i->setSrc(0, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
      // Move the indirect reference to right after the coords
      else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(arg, 1);
         i->setSrc(arg, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (i->tex.r == 0xffff) {
         i->tex.r = 0x20;
         i->tex.s = 0x10;
      }

      // Drop the indirect sources from the instruction and fold the static
      // r/s indices into the relative values when they are non-zero.
      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         if (i->tex.r)
            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                ticRel, bld.mkImm(i->tex.r));
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         if (i->tex.s)
            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                tscRel, bld.mkImm(i->tex.s));
      }

      // Free up slot 0: either rotate the layer to the front or shift all
      // sources up by one.
      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
      if (arrayIndex) {
         for (int s = dim; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, arrayIndex);
      } else {
         i->moveSources(0, 1);
      }

      // Start the combined source from the u16 layer (or zero) ...
      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      // ... then insert the tic and tsc indices into it.
      if (ticRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      if (tscRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);

      i->setSrc(0, src);
   }

   // For nvc0, the sample id has to be in the second operand, as the offset
   // does. Right now we don't know how to pass both in, and this case can't
   // happen with OpenGL. On nve0, the sample id is part of the texture
   // coordinate argument.
   assert(chipset >= NVISA_GK104_CHIPSET ||
          !i->tex.useOffsets || !i->tex.target.isMS());

   // offset is between lod and dc
   if (i->tex.useOffsets) {
      int n, c;
      int s = i->srcCount(0xff, true);
      if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
         // Make room for the offset source(s): before the depth compare
         // value for shadow targets, otherwise at the end.
         if (i->tex.target.isShadow())
            s--;
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 1);
         if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
            i->moveSources(s + 1, 1);
      }
      if (i->op == OP_TXG) {
         // Either there is 1 offset, which goes into the 2 low bytes of the
         // first source, or there are 4 offsets, which go into 2 sources (8
         // values, 1 byte each).
         Value *offs[2] = {NULL, NULL};
         for (n = 0; n < i->tex.useOffsets; n++) {
            for (c = 0; c < 2; ++c) {
               // The first component of every even offset starts a new
               // register; all others are inserted into the current one.
               if ((n % 2) == 0 && c == 0)
                  bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
               else
                  bld.mkOp3(OP_INSBF, TYPE_U32,
                            offs[n / 2],
                            i->offset[n][c].get(),
                            bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
                            offs[n / 2]);
            }
         }
         i->setSrc(s, offs[0]);
         if (offs[1])
            i->setSrc(s + 1, offs[1]);
      } else {
         // Non-TXG: a single immediate offset, 4 bits per component.
         unsigned imm = 0;
         assert(i->tex.useOffsets == 1);
         for (c = 0; c < 3; ++c) {
            ImmediateValue val;
            if (!i->offset[0][c].getImmediate(val))
               assert(!"non-immediate offset passed to non-TXG");
            imm |= (val.reg.data.u32 & 0xf) << (c * 4);
         }
         if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
            // The offset goes into the upper 16 bits of the array index. So
            // create it if it's not already there, and INSBF it if it already
            // is.
            s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
            if (chipset >= NVISA_GM107_CHIPSET)
               s += dim;
            if (i->tex.target.isArray()) {
               Value *offset = bld.getScratch();
               bld.mkOp3(OP_INSBF, TYPE_U32, offset,
                         bld.loadImm(NULL, imm), bld.mkImm(0xc10),
                         i->getSrc(s));
               i->setSrc(s, offset);
            } else {
               i->moveSources(s, 1);
               i->setSrc(s, bld.loadImm(NULL, imm << 16));
            }
         } else {
            i->setSrc(s, bld.loadImm(NULL, imm));
         }
      }
   }

   if (chipset >= NVISA_GK104_CHIPSET) {
      //
      // If TEX requires more than 4 sources, the 2nd register tuple must be
      // aligned to 4, even if it consists of just a single 4-byte register.
      //
      // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
      //
      int s = i->srcCount(0xff, true);
      if (s > 4 && s < 7) {
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 7 - s);
         while (s < 7)
            i->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

   return true;
}
1084 
// Replace a texture instruction carrying explicit derivatives (dPdx/dPdy)
// by four plain TEX operations, one per quad lane.
//
// For each lane l in 0..3 the function emits, between OP_QUADON and
// OP_QUADPOP: quad operations that copy lane l's coordinates (and, for
// l != 0, its leading array/indirect sources and depth compare value),
// quad operations that add dPdx/dPdy, an optional cube normalization, a
// clone of the instruction using the new sources, and a mov of every result
// marked with lanes = 1 << l. After the loop the four per-lane values of
// each def are combined with OP_UNION into the original def and the original
// instruction is removed from its block. Always returns true.
bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   // Always done from the l0 perspective. This is the way that NVIDIA's
   // driver does it, and doing it from the "current" lane's perspective
   // doesn't seem to always work for reasons that aren't altogether clear,
   // even in frag shaders.
   //
   // Note that we must move not only the coordinates into lane0, but also all
   // ancillary arguments, like array indices and depth compare as they may
   // differ between lanes. Offsets for TXD are supposed to be uniform, so we
   // leave them alone.
   // qOps[0] is used for the dPdx step, qOps[1] for the dPdy step; the four
   // entries are per-lane operations in UL, UR, LL, LR order (see QUADOP).
   static const uint8_t qOps[2] =
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) };

   Value *def[4][4];   // def[c][l]: result component c as computed for lane l
   Value *crd[3], *arr[2], *shadow;
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();

   // This function is invoked after handleTEX lowering, so we have to expect
   // the arguments in the order that the hw wants them. For Fermi, array and
   // indirect are both in the leading arg, while for Kepler, array and
   // indirect are separate (and both precede the coordinates). Maxwell is
   // handled in a separate function.
   // array: number of leading (non-coordinate) sources, 0..1 before GK104
   // and 0..2 from GK104 on.
   int array;
   if (targ->getChipset() < NVISA_GK104_CHIPSET)
      array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
   else
      array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   // Scratch values that are reused by every lane iteration.
   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();
   for (c = 0; c < array; ++c)
      arr[c] = bld.getScratch();
   shadow = bld.getScratch();

   for (l = 0; l < 4; ++l) {
      Value *src[3], *val;

      bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
      // we're using the texture result from lane 0 in all cases, so make sure
      // that lane 0 is pointing at the proper array index, indirect value,
      // and depth compare.
      if (l != 0) {
         for (c = 0; c < array; ++c)
            bld.mkQuadop(0x00, arr[c], l, i->getSrc(c), zero);
         if (i->tex.target.isShadow()) {
            // The next argument after coords is the depth compare
            bld.mkQuadop(0x00, shadow, l, i->getSrc(array + dim), zero);
         }
      }
      // mov position coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // normalize cube coordinates
      if (i->tex.target.isCube()) {
         // Scale by 1 / max(|x|, |y|, |z|), same as in handleTEX.
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      // Lane 0 keeps the original leading/shadow sources; the other lanes
      // use the values moved over above.
      if (l != 0) {
         for (c = 0; c < array; ++c)
            tex->setSrc(c, arr[c]);
         if (i->tex.target.isShadow())
            tex->setSrc(array + dim, shadow);
      }
      for (c = 0; c < dim; ++c)
         tex->setSrc(c + array, src[c]);
      // broadcast results from lane 0 to all lanes so that the moves *into*
      // the target lane pick up the proper value.
      if (l != 0)
         for (c = 0; i->defExists(c); ++c)
            bld.mkQuadop(0x00, tex->getDef(c), 0, tex->getDef(c), zero);
      bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }

   // Merge the four per-lane values of every result component.
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}
1200 
1201 bool
handleTXD(TexInstruction * txd)1202 NVC0LoweringPass::handleTXD(TexInstruction *txd)
1203 {
1204    int dim = txd->tex.target.getDim() + txd->tex.target.isCube();
1205    unsigned arg = txd->tex.target.getArgCount();
1206    unsigned expected_args = arg;
1207    const int chipset = prog->getTarget()->getChipset();
1208 
1209    if (chipset >= NVISA_GK104_CHIPSET) {
1210       if (!txd->tex.target.isArray() && txd->tex.useOffsets)
1211          expected_args++;
1212       if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
1213          expected_args++;
1214    } else {
1215       if (txd->tex.useOffsets)
1216          expected_args++;
1217       if (!txd->tex.target.isArray() && (
1218                 txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
1219          expected_args++;
1220    }
1221 
1222    if (expected_args > 4 ||
1223        dim > 2 ||
1224        txd->tex.target.isShadow())
1225       txd->op = OP_TEX;
1226 
1227    handleTEX(txd);
1228    while (txd->srcExists(arg))
1229       ++arg;
1230 
1231    txd->tex.derivAll = true;
1232    if (txd->op == OP_TEX)
1233       return handleManualTXD(txd);
1234 
1235    assert(arg == expected_args);
1236    for (int c = 0; c < dim; ++c) {
1237       txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
1238       txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
1239       txd->dPdx[c].set(NULL);
1240       txd->dPdy[c].set(NULL);
1241    }
1242 
1243    // In this case we have fewer than 4 "real" arguments, which means that
1244    // handleTEX didn't apply any padding. However we have to make sure that
1245    // the second "group" of arguments still gets padded up to 4.
1246    if (chipset >= NVISA_GK104_CHIPSET) {
1247       int s = arg + 2 * dim;
1248       if (s >= 4 && s < 7) {
1249          if (txd->srcExists(s)) // move potential predicate out of the way
1250             txd->moveSources(s, 7 - s);
1251          while (s < 7)
1252             txd->setSrc(s++, bld.loadImm(NULL, 0));
1253       }
1254    }
1255 
1256    return true;
1257 }
1258 
1259 bool
handleTXQ(TexInstruction * txq)1260 NVC0LoweringPass::handleTXQ(TexInstruction *txq)
1261 {
1262    const int chipset = prog->getTarget()->getChipset();
1263    if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
1264       txq->tex.r += prog->driver->io.texBindBase / 4;
1265 
1266    if (txq->tex.rIndirectSrc < 0)
1267       return true;
1268 
1269    Value *ticRel = txq->getIndirectR();
1270 
1271    txq->setIndirectS(NULL);
1272    txq->tex.sIndirectSrc = -1;
1273 
1274    assert(ticRel);
1275 
1276    if (chipset < NVISA_GK104_CHIPSET) {
1277       LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
1278 
1279       txq->setSrc(txq->tex.rIndirectSrc, NULL);
1280       if (txq->tex.r)
1281          ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
1282                              ticRel, bld.mkImm(txq->tex.r));
1283 
1284       bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));
1285 
1286       txq->moveSources(0, 1);
1287       txq->setSrc(0, src);
1288    } else {
1289       Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r);
1290       txq->tex.r = 0xff;
1291       txq->tex.s = 0x1f;
1292 
1293       txq->setIndirectR(NULL);
1294       txq->moveSources(0, 1);
1295       txq->setSrc(0, hnd);
1296       txq->tex.rIndirectSrc = 0;
1297    }
1298 
1299    return true;
1300 }
1301 
1302 bool
handleTXLQ(TexInstruction * i)1303 NVC0LoweringPass::handleTXLQ(TexInstruction *i)
1304 {
1305    /* The outputs are inverted compared to what the TGSI instruction
1306     * expects. Take that into account in the mask.
1307     */
1308    assert((i->tex.mask & ~3) == 0);
1309    if (i->tex.mask == 1)
1310       i->tex.mask = 2;
1311    else if (i->tex.mask == 2)
1312       i->tex.mask = 1;
1313    handleTEX(i);
1314    bld.setPosition(i, true);
1315 
1316    /* The returned values are not quite what we want:
1317     * (a) convert from s16/u16 to f32
1318     * (b) multiply by 1/256
1319     */
1320    for (int def = 0; def < 2; ++def) {
1321       if (!i->defExists(def))
1322          continue;
1323       enum DataType type = TYPE_S16;
1324       if (i->tex.mask == 2 || def > 0)
1325          type = TYPE_U16;
1326       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
1327       bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
1328                 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
1329    }
1330    if (i->tex.mask == 3) {
1331       LValue *t = new_LValue(func, FILE_GPR);
1332       bld.mkMov(t, i->getDef(0));
1333       bld.mkMov(i->getDef(0), i->getDef(1));
1334       bld.mkMov(i->getDef(1), t);
1335    }
1336    return true;
1337 }
1338 
1339 bool
handleBUFQ(Instruction * bufq)1340 NVC0LoweringPass::handleBUFQ(Instruction *bufq)
1341 {
1342    bufq->op = OP_MOV;
1343    bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1),
1344                                    bufq->getSrc(0)->reg.fileIndex * 16));
1345    bufq->setIndirect(0, 0, NULL);
1346    bufq->setIndirect(0, 1, NULL);
1347    return true;
1348 }
1349 
1350 void
handleSharedATOMNVE4(Instruction * atom)1351 NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
1352 {
1353    assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
1354 
1355    BasicBlock *currBB = atom->bb;
1356    BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
1357    BasicBlock *joinBB = atom->bb->splitAfter(atom);
1358    BasicBlock *setAndUnlockBB = new BasicBlock(func);
1359    BasicBlock *failLockBB = new BasicBlock(func);
1360 
1361    bld.setPosition(currBB, true);
1362    assert(!currBB->joinAt);
1363    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
1364 
1365    CmpInstruction *pred =
1366       bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
1367                 TYPE_U32, bld.mkImm(0), bld.mkImm(1));
1368 
1369    bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
1370    currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);
1371 
1372    bld.setPosition(tryLockBB, true);
1373 
1374    Instruction *ld =
1375       bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
1376                  atom->getIndirect(0, 0));
1377    ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
1378    ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
1379 
1380    bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
1381    bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
1382    tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
1383    tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);
1384 
1385    tryLockBB->cfg.detach(&joinBB->cfg);
1386    bld.remove(atom);
1387 
1388    bld.setPosition(setAndUnlockBB, true);
1389    Value *stVal;
1390    if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
1391       // Read the old value, and write the new one.
1392       stVal = atom->getSrc(1);
1393    } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
1394       CmpInstruction *set =
1395          bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
1396                    TYPE_U32, ld->getDef(0), atom->getSrc(1));
1397 
1398       bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
1399                 TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
1400    } else {
1401       operation op;
1402 
1403       switch (atom->subOp) {
1404       case NV50_IR_SUBOP_ATOM_ADD:
1405          op = OP_ADD;
1406          break;
1407       case NV50_IR_SUBOP_ATOM_AND:
1408          op = OP_AND;
1409          break;
1410       case NV50_IR_SUBOP_ATOM_OR:
1411          op = OP_OR;
1412          break;
1413       case NV50_IR_SUBOP_ATOM_XOR:
1414          op = OP_XOR;
1415          break;
1416       case NV50_IR_SUBOP_ATOM_MIN:
1417          op = OP_MIN;
1418          break;
1419       case NV50_IR_SUBOP_ATOM_MAX:
1420          op = OP_MAX;
1421          break;
1422       default:
1423          assert(0);
1424          return;
1425       }
1426 
1427       stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
1428                          atom->getSrc(1));
1429    }
1430 
1431    Instruction *st =
1432       bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
1433                   atom->getIndirect(0, 0), stVal);
1434    st->setDef(0, pred->getDef(0));
1435    st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
1436 
1437    bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
1438    setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
1439 
1440    // Lock until the store has not been performed.
1441    bld.setPosition(failLockBB, true);
1442    bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
1443    bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
1444    failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
1445    failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);
1446 
1447    bld.setPosition(joinBB, false);
1448    bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
1449 }
1450 
1451 void
handleSharedATOM(Instruction * atom)1452 NVC0LoweringPass::handleSharedATOM(Instruction *atom)
1453 {
1454    assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
1455 
1456    BasicBlock *currBB = atom->bb;
1457    BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false);
1458    BasicBlock *joinBB = atom->bb->splitAfter(atom);
1459 
1460    bld.setPosition(currBB, true);
1461    assert(!currBB->joinAt);
1462    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
1463 
1464    bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL);
1465    currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE);
1466 
1467    bld.setPosition(tryLockAndSetBB, true);
1468 
1469    Instruction *ld =
1470       bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
1471                  atom->getIndirect(0, 0));
1472    ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
1473    ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
1474 
1475    Value *stVal;
1476    if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
1477       // Read the old value, and write the new one.
1478       stVal = atom->getSrc(1);
1479    } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
1480       CmpInstruction *set =
1481          bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
1482                    TYPE_U32, ld->getDef(0), atom->getSrc(1));
1483       set->setPredicate(CC_P, ld->getDef(1));
1484 
1485       Instruction *selp =
1486          bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0),
1487                    atom->getSrc(2), set->getDef(0));
1488       selp->src(2).mod = Modifier(NV50_IR_MOD_NOT);
1489       selp->setPredicate(CC_P, ld->getDef(1));
1490 
1491       stVal = selp->getDef(0);
1492    } else {
1493       operation op;
1494 
1495       switch (atom->subOp) {
1496       case NV50_IR_SUBOP_ATOM_ADD:
1497          op = OP_ADD;
1498          break;
1499       case NV50_IR_SUBOP_ATOM_AND:
1500          op = OP_AND;
1501          break;
1502       case NV50_IR_SUBOP_ATOM_OR:
1503          op = OP_OR;
1504          break;
1505       case NV50_IR_SUBOP_ATOM_XOR:
1506          op = OP_XOR;
1507          break;
1508       case NV50_IR_SUBOP_ATOM_MIN:
1509          op = OP_MIN;
1510          break;
1511       case NV50_IR_SUBOP_ATOM_MAX:
1512          op = OP_MAX;
1513          break;
1514       default:
1515          assert(0);
1516          return;
1517       }
1518 
1519       Instruction *i =
1520          bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
1521                    atom->getSrc(1));
1522       i->setPredicate(CC_P, ld->getDef(1));
1523 
1524       stVal = i->getDef(0);
1525    }
1526 
1527    Instruction *st =
1528       bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
1529                   atom->getIndirect(0, 0), stVal);
1530    st->setPredicate(CC_P, ld->getDef(1));
1531    st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
1532 
1533    // Loop until the lock is acquired.
1534    bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1));
1535    tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK);
1536    tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS);
1537    bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
1538 
1539    bld.remove(atom);
1540 
1541    bld.setPosition(joinBB, false);
1542    bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
1543 }
1544 
1545 bool
handleATOM(Instruction * atom)1546 NVC0LoweringPass::handleATOM(Instruction *atom)
1547 {
1548    SVSemantic sv;
1549    Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;
1550 
1551    switch (atom->src(0).getFile()) {
1552    case FILE_MEMORY_LOCAL:
1553       sv = SV_LBASE;
1554       break;
1555    case FILE_MEMORY_SHARED:
1556       // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
1557       // operations on shared memory. For Maxwell, ATOMS is enough.
1558       if (targ->getChipset() < NVISA_GK104_CHIPSET)
1559          handleSharedATOM(atom);
1560       else if (targ->getChipset() < NVISA_GM107_CHIPSET)
1561          handleSharedATOMNVE4(atom);
1562       return true;
1563    default:
1564       assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);
1565       base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
1566       assert(base->reg.size == 8);
1567       if (ptr)
1568          base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
1569       assert(base->reg.size == 8);
1570       atom->setIndirect(0, 0, base);
1571       atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
1572 
1573       // Harden against out-of-bounds accesses
1574       Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType));
1575       Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16);
1576       Value *pred = new_LValue(func, FILE_PREDICATE);
1577       if (ptr)
1578          bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr);
1579       bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
1580       atom->setPredicate(CC_NOT_P, pred);
1581       if (atom->defExists(0)) {
1582          Value *zero, *dst = atom->getDef(0);
1583          atom->setDef(0, bld.getSSA());
1584 
1585          bld.setPosition(atom, true);
1586          bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
1587             ->setPredicate(CC_P, pred);
1588          bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero);
1589       }
1590 
1591       return true;
1592    }
1593    base =
1594       bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
1595 
1596    atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
1597    atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
1598    if (ptr)
1599       base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
1600    atom->setIndirect(0, 1, NULL);
1601    atom->setIndirect(0, 0, base);
1602 
1603    return true;
1604 }
1605 
1606 bool
handleCasExch(Instruction * cas,bool needCctl)1607 NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
1608 {
1609    if (targ->getChipset() < NVISA_GM107_CHIPSET) {
1610       if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
1611          // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
1612          return false;
1613       }
1614    }
1615 
1616    if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
1617        cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
1618       return false;
1619    bld.setPosition(cas, true);
1620 
1621    if (needCctl) {
1622       Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
1623       cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
1624       cctl->fixed = 1;
1625       cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
1626       if (cas->isPredicated())
1627          cctl->setPredicate(cas->cc, cas->getPredicate());
1628    }
1629 
1630    if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
1631       // CAS is crazy. It's 2nd source is a double reg, and the 3rd source
1632       // should be set to the high part of the double reg or bad things will
1633       // happen elsewhere in the universe.
1634       // Also, it sometimes returns the new value instead of the old one
1635       // under mysterious circumstances.
1636       Value *dreg = bld.getSSA(8);
1637       bld.setPosition(cas, false);
1638       bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
1639       cas->setSrc(1, dreg);
1640       cas->setSrc(2, dreg);
1641    }
1642 
1643    return true;
1644 }
1645 
1646 inline Value *
loadResInfo32(Value * ptr,uint32_t off,uint16_t base)1647 NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
1648 {
1649    uint8_t b = prog->driver->io.auxCBSlot;
1650    off += base;
1651 
1652    return bld.
1653       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
1654 }
1655 
1656 inline Value *
loadResInfo64(Value * ptr,uint32_t off,uint16_t base)1657 NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
1658 {
1659    uint8_t b = prog->driver->io.auxCBSlot;
1660    off += base;
1661 
1662    if (ptr)
1663       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
1664 
1665    return bld.
1666       mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
1667 }
1668 
1669 inline Value *
loadResLength32(Value * ptr,uint32_t off,uint16_t base)1670 NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
1671 {
1672    uint8_t b = prog->driver->io.auxCBSlot;
1673    off += base;
1674 
1675    if (ptr)
1676       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
1677 
1678    return bld.
1679       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
1680 }
1681 
1682 inline Value *
loadBufInfo64(Value * ptr,uint32_t off)1683 NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
1684 {
1685    return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
1686 }
1687 
1688 inline Value *
loadBufLength32(Value * ptr,uint32_t off)1689 NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
1690 {
1691    return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
1692 }
1693 
1694 inline Value *
loadUboInfo64(Value * ptr,uint32_t off)1695 NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
1696 {
1697    return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
1698 }
1699 
1700 inline Value *
loadUboLength32(Value * ptr,uint32_t off)1701 NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
1702 {
1703    return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
1704 }
1705 
1706 inline Value *
loadMsInfo32(Value * ptr,uint32_t off)1707 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
1708 {
1709    uint8_t b = prog->driver->io.msInfoCBSlot;
1710    off += prog->driver->io.msInfoBase;
1711    return bld.
1712       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
1713 }
1714 
/* On nvc0, surface info is obtained via the surface binding points passed
 * to the SULD/SUST instructions.
 * On nve4, surface info is stored in c[] and is used by various special
 * instructions, e.g. for clamping coordinates or generating an address.
 * They could have just added an equivalent to TIC, couldn't they?
 *
 * The values below are the offsets of the individual fields inside the info
 * record of one surface; loadSuInfo32() adds them to the start of the record
 * selected by the surface slot. The per-field notes describe how the lowering
 * code in this file uses each field.
 */
#define NVC0_SU_INFO_ADDR   0x00 /* address part; 0 is treated as "not bound" */
#define NVC0_SU_INFO_FMT    0x04 /* format word passed on as a SULD/SUST source */
#define NVC0_SU_INFO_DIM_X  0x08 /* SUCLAMP operand for the x coordinate */
#define NVC0_SU_INFO_PITCH  0x0c /* multiplier in the pixel offset MADSP */
#define NVC0_SU_INFO_DIM_Y  0x10 /* SUCLAMP operand for the y coordinate */
#define NVC0_SU_INFO_ARRAY  0x14 /* multiplied with the layer for the layer offset */
#define NVC0_SU_INFO_DIM_Z  0x18 /* SUCLAMP operand for the z coordinate */
#define NVC0_SU_INFO_UNK1C  0x1c /* unknown; used for 3D and non-layered 2D offsets */
#define NVC0_SU_INFO_WIDTH  0x20 /* size returned by SUQ, component 0 */
#define NVC0_SU_INFO_HEIGHT 0x24 /* size returned by SUQ, component 1 */
#define NVC0_SU_INFO_DEPTH  0x28 /* size returned by SUQ, component 2 */
#define NVC0_SU_INFO_TARGET 0x2c /* not referenced by the code visible here */
#define NVC0_SU_INFO_BSIZE  0x30 /* compared against (sum of format bits) / 8 */
#define NVC0_SU_INFO_RAW_X  0x34 /* SUCLAMP operand for x in raw accesses */
#define NVC0_SU_INFO_MS_X   0x38 /* left shift applied to x for MS surfaces */
#define NVC0_SU_INFO_MS_Y   0x3c /* left shift applied to y for MS surfaces */

/* Size of one surface's info record (loadSuInfo32() relies on it being 1 << 6). */
#define NVC0_SU_INFO__STRIDE 0x40

/* Indexed accessors: DIM(0..2) = DIM_X/Y/Z, SIZE(0..2) = WIDTH/HEIGHT/DEPTH,
 * MS(0..1) = MS_X/MS_Y.
 */
#define NVC0_SU_INFO_DIM(i)  (0x08 + (i) * 8)
#define NVC0_SU_INFO_SIZE(i) (0x20 + (i) * 4)
#define NVC0_SU_INFO_MS(i)   (0x38 + (i) * 4)
1743 
1744 inline Value *
loadSuInfo32(Value * ptr,int slot,uint32_t off,bool bindless)1745 NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless)
1746 {
1747    uint32_t base = slot * NVC0_SU_INFO__STRIDE;
1748 
1749    if (ptr) {
1750       ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
1751       if (bindless)
1752          ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(511));
1753       else
1754          ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
1755       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));
1756       base = 0;
1757    }
1758    off += base;
1759 
1760    return loadResInfo32(ptr, off, bindless ? prog->driver->io.bindlessBase :
1761                         prog->driver->io.suInfoBase);
1762 }
1763 
getSuClampSubOp(const TexInstruction * su,int c)1764 static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
1765 {
1766    switch (su->tex.target.getEnum()) {
1767    case TEX_TARGET_BUFFER:      return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
1768    case TEX_TARGET_RECT:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1769    case TEX_TARGET_1D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1770    case TEX_TARGET_1D_ARRAY:    return (c == 1) ?
1771                                    NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
1772                                    NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1773    case TEX_TARGET_2D:          return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1774    case TEX_TARGET_2D_MS:       return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1775    case TEX_TARGET_2D_ARRAY:    return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1776    case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1777    case TEX_TARGET_3D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1778    case TEX_TARGET_CUBE:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1779    case TEX_TARGET_CUBE_ARRAY:  return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1780    default:
1781       assert(0);
1782       return 0;
1783    }
1784 }
1785 
1786 bool
handleSUQ(TexInstruction * suq)1787 NVC0LoweringPass::handleSUQ(TexInstruction *suq)
1788 {
1789    int mask = suq->tex.mask;
1790    int dim = suq->tex.target.getDim();
1791    int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
1792    Value *ind = suq->getIndirectR();
1793    int slot = suq->tex.r;
1794    int c, d;
1795 
1796    for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
1797       if (c >= arg || !(mask & 1))
1798          continue;
1799 
1800       int offset;
1801 
1802       if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
1803          offset = NVC0_SU_INFO_SIZE(2);
1804       } else {
1805          offset = NVC0_SU_INFO_SIZE(c);
1806       }
1807       bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset, suq->tex.bindless));
1808       if (c == 2 && suq->tex.target.isCube())
1809          bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
1810                    bld.loadImm(NULL, 6));
1811    }
1812 
1813    if (mask & 1) {
1814       if (suq->tex.target.isMS()) {
1815          Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), suq->tex.bindless);
1816          Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), suq->tex.bindless);
1817          Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
1818          bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
1819       } else {
1820          bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
1821       }
1822    }
1823 
1824    bld.remove(suq);
1825    return true;
1826 }
1827 
1828 void
adjustCoordinatesMS(TexInstruction * tex)1829 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
1830 {
1831    const int arg = tex->tex.target.getArgCount();
1832    int slot = tex->tex.r;
1833 
1834    if (tex->tex.target == TEX_TARGET_2D_MS)
1835       tex->tex.target = TEX_TARGET_2D;
1836    else
1837    if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
1838       tex->tex.target = TEX_TARGET_2D_ARRAY;
1839    else
1840       return;
1841 
1842    Value *x = tex->getSrc(0);
1843    Value *y = tex->getSrc(1);
1844    Value *s = tex->getSrc(arg - 1);
1845 
1846    Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
1847    Value *ind = tex->getIndirectR();
1848 
1849    Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), tex->tex.bindless);
1850    Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), tex->tex.bindless);
1851 
1852    bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
1853    bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
1854 
1855    s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
1856    s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));
1857 
1858    Value *dx = loadMsInfo32(ts, 0x0);
1859    Value *dy = loadMsInfo32(ts, 0x4);
1860 
1861    bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
1862    bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
1863 
1864    tex->setSrc(0, tx);
1865    tex->setSrc(1, ty);
1866    tex->moveSources(arg, -1);
1867 }
1868 
// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
// They're computed from the coordinates using the surface info in c[] space.
//
// On return the coordinate sources of su have been replaced by:
//   src 0: 64-bit address (merge of bf and eau)
//   src 1: format word from the surface info, or immediate 0 for raw access
//   src 2: predicate produced by the clamping / address instructions
// The indirect resource source is cleared, and su is predicated (CC_NOT_P)
// on a second predicate that is set when the surface info address is 0 or,
// for non-SUSTP operations with a known format, when the block size in the
// surface info differs from the size of the format.
void
NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
{
   Instruction *insn;
   // atomic (reduction) operation
   const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
   // "B" variants of the operations: raw access
   const bool raw =
      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
   const int slot = su->tex.r;
   // dim and arg are taken from the target as it is on entry, i.e. before
   // adjustCoordinatesMS() below replaces a multisampled target
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   int c;
   Value *zero = bld.mkImm(0);
   Value *p1 = NULL; // predicate of the layer clamp, only set for arrays/cubes
   Value *v;         // temporary for values loaded from the surface info
   Value *src[3];    // clamped coordinates, padded with zero
   Value *bf, *eau, *off;
   Value *addr, *pred;
   Value *ind = su->getIndirectR();

   off = bld.getScratch(4);
   bf = bld.getScratch(4);
   addr = bld.getSSA(8);
   pred = bld.getScratch(1, FILE_PREDICATE);

   bld.setPosition(su, false);

   adjustCoordinatesMS(su);

   // calculate clamped coordinates
   for (c = 0; c < arg; ++c) {
      int dimc = c;

      if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {
         // The array index is stored in the Z component for 1D arrays.
         dimc = 2;
      }

      src[c] = bld.getScratch();
      // raw accesses clamp x against RAW_X instead of DIM_X
      if (c == 0 && raw)
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X, su->tex.bindless);
      else
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc), su->tex.bindless);
      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
         ->subOp = getSuClampSubOp(su, dimc);
   }
   for (; c < 3; ++c)
      src[c] = zero;

   // set predicate output
   // (buffers: the x clamp defines pred; arrays/cubes: the layer clamp
   // defines p1, which is OR'ed into pred further down)
   if (su->tex.target == TEX_TARGET_BUFFER) {
      src[0]->getInsn()->setFlagsDef(1, pred);
   } else
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
      p1 = bld.getSSA(1, FILE_PREDICATE);
      src[dim]->getInsn()->setFlagsDef(1, p1);
   }

   // calculate pixel offset
   if (dim == 1) {
      // for buffers off is not written here; it is only set (from bf) in the
      // atomic path below
      if (su->tex.target != TEX_TARGET_BUFFER)
         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
   } else
   if (dim == 3) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
         ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l

      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
         ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
   } else {
      assert(dim == 2);
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
         ->subOp = (su->tex.target.isArray() || su->tex.target.isCube()) ?
         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
   }

   // calculate effective address part 1
   if (su->tex.target == TEX_TARGET_BUFFER) {
      if (raw) {
         // raw buffer access: the clamped x coordinate is used directly
         bf = src[0];
      } else {
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
      }
   } else {
      Value *y = src[1];
      Value *z = src[2];
      uint16_t subOp = 0;

      // select the y/z operands and the sub-op of SUBFM by dimensionality
      switch (dim) {
      case 1:
         y = zero;
         z = zero;
         break;
      case 2:
         z = off;
         if (!su->tex.target.isArray() && !su->tex.target.isCube()) {
            z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
            subOp = NV50_IR_SUBOP_SUBFM_3D;
         }
         break;
      default:
         subOp = NV50_IR_SUBOP_SUBFM_3D;
         assert(dim == 3);
         break;
      }
      insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
      insn->subOp = subOp;
      insn->setFlagsDef(1, pred);
   }

   // part 2
   v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless);

   if (su->tex.target == TEX_TARGET_BUFFER) {
      eau = v;
   } else {
      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
   }
   // add array layer offset
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
      // the layer index is src[1] for 1D arrays and src[2] otherwise
      if (dim == 1)
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
      else
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
            ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
      // combine predicates
      assert(p1);
      bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
   }

   if (atom) {
      Value *lo = bf;
      if (su->tex.target == TEX_TARGET_BUFFER) {
         // keep bf in off; it is added to the merged address further down
         lo = zero;
         bld.mkMov(off, bf);
      }
      //  bf == g[] address & 0xff
      // eau == g[] address >> 8
      bld.mkOp3(OP_PERMT, TYPE_U32,  bf,   lo, bld.loadImm(NULL, 0x6540), eau);
      bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
   } else
   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
      // Convert from u32 to u8 address format, which is what the library code
      // doing SULDP currently uses.
      // XXX: can SUEAU do this ?
      // XXX: does it matter that we don't mask high bytes in bf ?
      // Grrr.
      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
   }

   // addr = 64-bit value with bf as first and eau as second merge operand
   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);

   if (atom && su->tex.target == TEX_TARGET_BUFFER)
      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);

   // let's just set it 0 for raw access and hope it works
   v = raw ?
      bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);

   // get rid of old coordinate sources, make space for fmt info and predicate
   su->moveSources(arg, 3 - arg);
   // set 64 bit address and 32-bit format sources
   su->setSrc(0, addr);
   su->setSrc(1, v);
   su->setSrc(2, pred);
   su->setIndirectR(NULL);

   // prevent read fault when the image is not actually bound
   // (pred1 is set when the address field of the surface info is 0)
   CmpInstruction *pred1 =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));

   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      // total number of bits of one texel in the declared format
      int blockwidth = format->bits[0] + format->bits[1] +
                       format->bits[2] + format->bits[3];

      // make sure that the format doesn't mismatch
      // (pred1 |= declared size in bytes != block size in the surface info)
      assert(format->components != 0);
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
                pred1->getDef(0));
   }
   // only execute the surface operation when pred1 is not set
   su->setPredicate(CC_NOT_P, pred1->getDef(0));

   // TODO: initialize def values to 0 when the surface operation is not
   // performed (not needed for stores). Also, fix the "address bounds test"
   // subtests from arb_shader_image_load_store-invalid for buffers, because it
   // seems like that the predicate is not correctly set by suclamp.
}
2070 
2071 static DataType
getSrcType(const TexInstruction::ImgFormatDesc * t,int c)2072 getSrcType(const TexInstruction::ImgFormatDesc *t, int c)
2073 {
2074    switch (t->type) {
2075    case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
2076    case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
2077    case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
2078    case UINT:
2079       return (t->bits[c] == 8 ? TYPE_U8 :
2080               (t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));
2081    case SINT:
2082       return (t->bits[c] == 8 ? TYPE_S8 :
2083               (t->bits[c] == 16 ? TYPE_S16 : TYPE_S32));
2084    }
2085    return TYPE_NONE;
2086 }
2087 
2088 static DataType
getDestType(const ImgType type)2089 getDestType(const ImgType type) {
2090    switch (type) {
2091    case FLOAT:
2092    case UNORM:
2093    case SNORM:
2094       return TYPE_F32;
2095    case UINT:
2096       return TYPE_U32;
2097    case SINT:
2098       return TYPE_S32;
2099    default:
2100       assert(!"Impossible type");
2101       return TYPE_NONE;
2102    }
2103 }
2104 
2105 void
convertSurfaceFormat(TexInstruction * su)2106 NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
2107 {
2108    const TexInstruction::ImgFormatDesc *format = su->tex.format;
2109    int width = format->bits[0] + format->bits[1] +
2110       format->bits[2] + format->bits[3];
2111    Value *untypedDst[4] = {};
2112    Value *typedDst[4] = {};
2113 
2114    // We must convert this to a generic load.
2115    su->op = OP_SULDB;
2116 
2117    su->dType = typeOfSize(width / 8);
2118    su->sType = TYPE_U8;
2119 
2120    for (int i = 0; i < width / 32; i++)
2121       untypedDst[i] = bld.getSSA();
2122    if (width < 32)
2123       untypedDst[0] = bld.getSSA();
2124 
2125    for (int i = 0; i < 4; i++) {
2126       typedDst[i] = su->getDef(i);
2127    }
2128 
2129    // Set the untyped dsts as the su's destinations
2130    for (int i = 0; i < 4; i++)
2131       su->setDef(i, untypedDst[i]);
2132 
2133    bld.setPosition(su, true);
2134 
2135    // Unpack each component into the typed dsts
2136    int bits = 0;
2137    for (int i = 0; i < 4; bits += format->bits[i], i++) {
2138       if (!typedDst[i])
2139          continue;
2140       if (i >= format->components) {
2141          if (format->type == FLOAT ||
2142              format->type == UNORM ||
2143              format->type == SNORM)
2144             bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
2145          else
2146             bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
2147          continue;
2148       }
2149 
2150       // Get just that component's data into the relevant place
2151       if (format->bits[i] == 32)
2152          bld.mkMov(typedDst[i], untypedDst[i]);
2153       else if (format->bits[i] == 16)
2154          bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
2155                    getSrcType(format, i), untypedDst[i / 2])
2156          ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);
2157       else if (format->bits[i] == 8)
2158          bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
2159                    getSrcType(format, i), untypedDst[0])->subOp = i;
2160       else {
2161          bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],
2162                    bld.mkImm((bits % 32) | (format->bits[i] << 8)));
2163          if (format->type == UNORM || format->type == SNORM)
2164             bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);
2165       }
2166 
2167       // Normalize / convert as necessary
2168       if (format->type == UNORM)
2169          bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
2170       else if (format->type == SNORM)
2171          bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
2172       else if (format->type == FLOAT && format->bits[i] < 16) {
2173          bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
2174          bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);
2175       }
2176    }
2177 
2178    if (format->bgra) {
2179       std::swap(typedDst[0], typedDst[2]);
2180    }
2181 }
2182 
2183 void
handleSurfaceOpNVE4(TexInstruction * su)2184 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
2185 {
2186    processSurfaceCoordsNVE4(su);
2187 
2188    if (su->op == OP_SULDP)
2189       convertSurfaceFormat(su);
2190 
2191    if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2192       assert(su->getPredicate());
2193       Value *pred =
2194          bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),
2195                     su->getPredicate(), su->getSrc(2));
2196 
2197       Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());
2198       red->subOp = su->subOp;
2199       red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));
2200       red->setSrc(1, su->getSrc(3));
2201       if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
2202          red->setSrc(2, su->getSrc(4));
2203       red->setIndirect(0, 0, su->getSrc(0));
2204 
2205       // make sure to initialize dst value when the atomic operation is not
2206       // performed
2207       Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2208 
2209       assert(su->cc == CC_NOT_P);
2210       red->setPredicate(su->cc, pred);
2211       mov->setPredicate(CC_P, pred);
2212 
2213       bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),
2214                 red->getDef(0), mov->getDef(0));
2215 
2216       delete_Instruction(bld.getProgram(), su);
2217       handleCasExch(red, true);
2218    }
2219 
2220    if (su->op == OP_SUSTB || su->op == OP_SUSTP)
2221       su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
2222 }
2223 
2224 void
processSurfaceCoordsNVC0(TexInstruction * su)2225 NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
2226 {
2227    const int slot = su->tex.r;
2228    const int dim = su->tex.target.getDim();
2229    const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2230    int c;
2231    Value *zero = bld.mkImm(0);
2232    Value *src[3];
2233    Value *v;
2234    Value *ind = su->getIndirectR();
2235 
2236    bld.setPosition(su, false);
2237 
2238    adjustCoordinatesMS(su);
2239 
2240    if (ind) {
2241       Value *ptr;
2242       ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));
2243       ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
2244       su->setIndirectR(ptr);
2245    }
2246 
2247    // get surface coordinates
2248    for (c = 0; c < arg; ++c)
2249       src[c] = su->getSrc(c);
2250    for (; c < 3; ++c)
2251       src[c] = zero;
2252 
2253    // calculate pixel offset
2254    if (su->op == OP_SULDP || su->op == OP_SUREDP) {
2255       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless);
2256       su->setSrc(0, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[0], v));
2257    }
2258 
2259    // add array layer offset
2260    if (su->tex.target.isArray() || su->tex.target.isCube()) {
2261       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
2262       assert(dim > 1);
2263       su->setSrc(2, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v));
2264    }
2265 
2266    // prevent read fault when the image is not actually bound
2267    CmpInstruction *pred =
2268       bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2269                 TYPE_U32, bld.mkImm(0),
2270                 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2271    if (su->op != OP_SUSTP && su->tex.format) {
2272       const TexInstruction::ImgFormatDesc *format = su->tex.format;
2273       int blockwidth = format->bits[0] + format->bits[1] +
2274                        format->bits[2] + format->bits[3];
2275 
2276       assert(format->components != 0);
2277       // make sure that the format doesn't mismatch when it's not FMT_NONE
2278       bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2279                 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2280                 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2281                 pred->getDef(0));
2282    }
2283    su->setPredicate(CC_NOT_P, pred->getDef(0));
2284 }
2285 
2286 void
handleSurfaceOpNVC0(TexInstruction * su)2287 NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
2288 {
2289    if (su->tex.target == TEX_TARGET_1D_ARRAY) {
2290       /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
2291        * will simplify the lowering pass and the texture constraints. */
2292       su->moveSources(1, 1);
2293       su->setSrc(1, bld.loadImm(NULL, 0));
2294       su->tex.target = TEX_TARGET_2D_ARRAY;
2295    }
2296 
2297    processSurfaceCoordsNVC0(su);
2298 
2299    if (su->op == OP_SULDP)
2300       convertSurfaceFormat(su);
2301 
2302    if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2303       const int dim = su->tex.target.getDim();
2304       const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2305       LValue *addr = bld.getSSA(8);
2306       Value *def = su->getDef(0);
2307 
2308       su->op = OP_SULEA;
2309 
2310       // Set the destination to the address
2311       su->dType = TYPE_U64;
2312       su->setDef(0, addr);
2313       su->setDef(1, su->getPredicate());
2314 
2315       bld.setPosition(su, true);
2316 
2317       // Perform the atomic op
2318       Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());
2319       red->subOp = su->subOp;
2320       red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));
2321       red->setSrc(1, su->getSrc(arg));
2322       if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)
2323          red->setSrc(2, su->getSrc(arg + 1));
2324       red->setIndirect(0, 0, addr);
2325 
2326       // make sure to initialize dst value when the atomic operation is not
2327       // performed
2328       Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2329 
2330       assert(su->cc == CC_NOT_P);
2331       red->setPredicate(su->cc, su->getPredicate());
2332       mov->setPredicate(CC_P, su->getPredicate());
2333 
2334       bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));
2335 
2336       handleCasExch(red, false);
2337    }
2338 }
2339 
2340 void
processSurfaceCoordsGM107(TexInstruction * su)2341 NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
2342 {
2343    const int slot = su->tex.r;
2344    const int dim = su->tex.target.getDim();
2345    const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2346    Value *ind = su->getIndirectR();
2347    int pos = 0;
2348 
2349    bld.setPosition(su, false);
2350 
2351    // add texture handle
2352    switch (su->op) {
2353    case OP_SUSTP:
2354       pos = 4;
2355       break;
2356    case OP_SUREDP:
2357       pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;
2358       break;
2359    default:
2360       assert(pos == 0);
2361       break;
2362    }
2363    su->setSrc(arg + pos, loadTexHandle(ind, slot + 32));
2364 
2365    // prevent read fault when the image is not actually bound
2366    CmpInstruction *pred =
2367       bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2368                 TYPE_U32, bld.mkImm(0),
2369                 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2370    if (su->op != OP_SUSTP && su->tex.format) {
2371       const TexInstruction::ImgFormatDesc *format = su->tex.format;
2372       int blockwidth = format->bits[0] + format->bits[1] +
2373                        format->bits[2] + format->bits[3];
2374 
2375       assert(format->components != 0);
2376       // make sure that the format doesn't mismatch when it's not FMT_NONE
2377       bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2378                 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2379                 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2380                 pred->getDef(0));
2381    }
2382    su->setPredicate(CC_NOT_P, pred->getDef(0));
2383 }
2384 
2385 void
handleSurfaceOpGM107(TexInstruction * su)2386 NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
2387 {
2388    processSurfaceCoordsGM107(su);
2389 
2390    if (su->op == OP_SULDP)
2391       convertSurfaceFormat(su);
2392 
2393    if (su->op == OP_SUREDP) {
2394       Value *def = su->getDef(0);
2395 
2396       su->op = OP_SUREDB;
2397       su->setDef(0, bld.getSSA());
2398 
2399       bld.setPosition(su, true);
2400 
2401       // make sure to initialize dst value when the atomic operation is not
2402       // performed
2403       Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2404 
2405       assert(su->cc == CC_NOT_P);
2406       mov->setPredicate(CC_P, su->getPredicate());
2407 
2408       bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0));
2409    }
2410 }
2411 
2412 bool
handleWRSV(Instruction * i)2413 NVC0LoweringPass::handleWRSV(Instruction *i)
2414 {
2415    Instruction *st;
2416    Symbol *sym;
2417    uint32_t addr;
2418 
2419    // must replace, $sreg are not writeable
2420    addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
2421    if (addr >= 0x400)
2422       return false;
2423    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
2424 
2425    st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
2426                     i->getSrc(1));
2427    st->perPatch = i->perPatch;
2428 
2429    bld.getBB()->remove(i);
2430    return true;
2431 }
2432 
2433 void
handleLDST(Instruction * i)2434 NVC0LoweringPass::handleLDST(Instruction *i)
2435 {
2436    if (i->src(0).getFile() == FILE_SHADER_INPUT) {
2437       if (prog->getType() == Program::TYPE_COMPUTE) {
2438          i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
2439          i->getSrc(0)->reg.fileIndex = 0;
2440       } else
2441       if (prog->getType() == Program::TYPE_GEOMETRY &&
2442           i->src(0).isIndirect(0)) {
2443          // XXX: this assumes vec4 units
2444          Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2445                                  i->getIndirect(0, 0), bld.mkImm(4));
2446          i->setIndirect(0, 0, ptr);
2447          i->op = OP_VFETCH;
2448       } else {
2449          i->op = OP_VFETCH;
2450          assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
2451       }
2452    } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
2453       if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
2454           prog->getType() == Program::TYPE_COMPUTE) {
2455          // The launch descriptor only allows to set up 8 CBs, but OpenGL
2456          // requires at least 12 UBOs. To bypass this limitation, we store the
2457          // addrs into the driver constbuf and we directly load from the global
2458          // memory.
2459          int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
2460          Value *ind = i->getIndirect(0, 1);
2461 
2462          if (!ind && fileIndex == -1)
2463             return;
2464 
2465          if (ind) {
2466             // Clamp the UBO index when an indirect access is used to avoid
2467             // loading information from the wrong place in the driver cb.
2468             // TODO - synchronize the max with the driver.
2469             ind = bld.mkOp2v(OP_MIN, TYPE_U32, bld.getSSA(),
2470                              bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
2471                                         ind, bld.loadImm(NULL, fileIndex)),
2472                              bld.loadImm(NULL, 13));
2473             fileIndex = 0;
2474          }
2475 
2476          Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2477          Value *ptr = loadUboInfo64(ind, fileIndex * 16);
2478          Value *length = loadUboLength32(ind, fileIndex * 16);
2479          Value *pred = new_LValue(func, FILE_PREDICATE);
2480          if (i->src(0).isIndirect(0)) {
2481             bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2482             bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2483          }
2484          i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2485          i->setIndirect(0, 1, NULL);
2486          i->setIndirect(0, 0, ptr);
2487          bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2488          i->setPredicate(CC_NOT_P, pred);
2489          Value *zero, *dst = i->getDef(0);
2490          i->setDef(0, bld.getSSA());
2491 
2492          bld.setPosition(i, true);
2493          bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
2494             ->setPredicate(CC_P, pred);
2495          bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
2496       } else if (i->src(0).isIndirect(1)) {
2497          Value *ptr;
2498          if (i->src(0).isIndirect(0))
2499             ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
2500                              i->getIndirect(0, 1), bld.mkImm(0x1010),
2501                              i->getIndirect(0, 0));
2502          else
2503             ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2504                              i->getIndirect(0, 1), bld.mkImm(16));
2505          i->setIndirect(0, 1, NULL);
2506          i->setIndirect(0, 0, ptr);
2507          i->subOp = NV50_IR_SUBOP_LDC_IS;
2508       }
2509    } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
2510       assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
2511       i->op = OP_VFETCH;
2512    } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {
2513       Value *ind = i->getIndirect(0, 1);
2514       Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
2515       // XXX come up with a way not to do this for EVERY little access but
2516       // rather to batch these up somehow. Unfortunately we've lost the
2517       // information about the field width by the time we get here.
2518       Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2519       Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
2520       Value *pred = new_LValue(func, FILE_PREDICATE);
2521       if (i->src(0).isIndirect(0)) {
2522          bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2523          bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2524       }
2525       i->setIndirect(0, 1, NULL);
2526       i->setIndirect(0, 0, ptr);
2527       i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2528       bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2529       i->setPredicate(CC_NOT_P, pred);
2530       if (i->defExists(0)) {
2531          Value *zero, *dst = i->getDef(0);
2532          i->setDef(0, bld.getSSA());
2533 
2534          bld.setPosition(i, true);
2535          bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
2536             ->setPredicate(CC_P, pred);
2537          bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
2538       }
2539    }
2540 }
2541 
2542 void
readTessCoord(LValue * dst,int c)2543 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
2544 {
2545    Value *laneid = bld.getSSA();
2546    Value *x, *y;
2547 
2548    bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
2549 
2550    if (c == 0) {
2551       x = dst;
2552       y = NULL;
2553    } else
2554    if (c == 1) {
2555       x = NULL;
2556       y = dst;
2557    } else {
2558       assert(c == 2);
2559       if (prog->driver->prop.tp.domain != PIPE_PRIM_TRIANGLES) {
2560          bld.mkMov(dst, bld.loadImm(NULL, 0));
2561          return;
2562       }
2563       x = bld.getSSA();
2564       y = bld.getSSA();
2565    }
2566    if (x)
2567       bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
2568    if (y)
2569       bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
2570 
2571    if (c == 2) {
2572       bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
2573       bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
2574    }
2575 }
2576 
2577 bool
handleRDSV(Instruction * i)2578 NVC0LoweringPass::handleRDSV(Instruction *i)
2579 {
2580    Symbol *sym = i->getSrc(0)->asSym();
2581    const SVSemantic sv = sym->reg.data.sv.sv;
2582    Value *vtx = NULL;
2583    Instruction *ld;
2584    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
2585 
2586    if (addr >= 0x400) {
2587       // mov $sreg
2588       if (sym->reg.data.sv.index == 3) {
2589          // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
2590          i->op = OP_MOV;
2591          i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
2592       }
2593       if (sv == SV_VERTEX_COUNT) {
2594          bld.setPosition(i, true);
2595          bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
2596       }
2597       return true;
2598    }
2599 
2600    switch (sv) {
2601    case SV_POSITION:
2602       assert(prog->getType() == Program::TYPE_FRAGMENT);
2603       if (i->srcExists(1)) {
2604          // Pass offset through to the interpolation logic
2605          ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
2606                            i->getDef(0), addr, NULL);
2607          ld->setSrc(1, i->getSrc(1));
2608       } else {
2609          bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
2610       }
2611       break;
2612    case SV_FACE:
2613    {
2614       Value *face = i->getDef(0);
2615       bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
2616       if (i->dType == TYPE_F32) {
2617          bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
2618          bld.mkOp1(OP_NEG, TYPE_S32, face, face);
2619          bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
2620       }
2621    }
2622       break;
2623    case SV_TESS_COORD:
2624       assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
2625       readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
2626       break;
2627    case SV_NTID:
2628    case SV_NCTAID:
2629    case SV_GRIDID:
2630       assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
2631       if (sym->reg.data.sv.index == 3) {
2632          i->op = OP_MOV;
2633          i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
2634          return true;
2635       }
2636       // Fallthrough
2637    case SV_WORK_DIM:
2638       addr += prog->driver->prop.cp.gridInfoBase;
2639       bld.mkLoad(TYPE_U32, i->getDef(0),
2640                  bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2641                               TYPE_U32, addr), NULL);
2642       break;
2643    case SV_SAMPLE_INDEX:
2644       // TODO: Properly pass source as an address in the PIX address space
2645       // (which can be of the form [r0+offset]). But this is currently
2646       // unnecessary.
2647       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2648       ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2649       break;
2650    case SV_SAMPLE_POS: {
2651       Value *off = new_LValue(func, FILE_GPR);
2652       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2653       ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2654       bld.mkOp2(OP_SHL, TYPE_U32, off, i->getDef(0), bld.mkImm(3));
2655       bld.mkLoad(TYPE_F32,
2656                  i->getDef(0),
2657                  bld.mkSymbol(
2658                        FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2659                        TYPE_U32, prog->driver->io.sampleInfoBase +
2660                        4 * sym->reg.data.sv.index),
2661                  off);
2662       break;
2663    }
2664    case SV_SAMPLE_MASK: {
2665       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2666       ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
2667       Instruction *sampleid =
2668          bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));
2669       sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2670       Value *masked =
2671          bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),
2672                     bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2673                                bld.loadImm(NULL, 1), sampleid->getDef(0)));
2674       if (prog->driver->prop.fp.persampleInvocation) {
2675          bld.mkMov(i->getDef(0), masked);
2676       } else {
2677          bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,
2678                    bld.mkImm(0))
2679             ->subOp = 1;
2680       }
2681       break;
2682    }
2683    case SV_BASEVERTEX:
2684    case SV_BASEINSTANCE:
2685    case SV_DRAWID:
2686       ld = bld.mkLoad(TYPE_U32, i->getDef(0),
2687                       bld.mkSymbol(FILE_MEMORY_CONST,
2688                                    prog->driver->io.auxCBSlot,
2689                                    TYPE_U32,
2690                                    prog->driver->io.drawInfoBase +
2691                                    4 * (sv - SV_BASEVERTEX)),
2692                       NULL);
2693       break;
2694    default:
2695       if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
2696          vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
2697       if (prog->getType() == Program::TYPE_FRAGMENT) {
2698          bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);
2699       } else {
2700          ld = bld.mkFetch(i->getDef(0), i->dType,
2701                           FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
2702          ld->perPatch = i->perPatch;
2703       }
2704       break;
2705    }
2706    bld.getBB()->remove(i);
2707    return true;
2708 }
2709 
2710 bool
handleDIV(Instruction * i)2711 NVC0LoweringPass::handleDIV(Instruction *i)
2712 {
2713    if (!isFloatType(i->dType))
2714       return true;
2715    bld.setPosition(i, false);
2716    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
2717    i->op = OP_MUL;
2718    i->setSrc(1, rcp->getDef(0));
2719    return true;
2720 }
2721 
2722 bool
handleMOD(Instruction * i)2723 NVC0LoweringPass::handleMOD(Instruction *i)
2724 {
2725    if (!isFloatType(i->dType))
2726       return true;
2727    LValue *value = bld.getScratch(typeSizeof(i->dType));
2728    bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
2729    bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
2730    bld.mkOp1(OP_TRUNC, i->dType, value, value);
2731    bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
2732    i->op = OP_SUB;
2733    i->setSrc(1, value);
2734    return true;
2735 }
2736 
2737 bool
handleSQRT(Instruction * i)2738 NVC0LoweringPass::handleSQRT(Instruction *i)
2739 {
2740    if (i->dType == TYPE_F64) {
2741       Value *pred = bld.getSSA(1, FILE_PREDICATE);
2742       Value *zero = bld.loadImm(NULL, 0.0);
2743       Value *dst = bld.getSSA(8);
2744       bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
2745       bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
2746       bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
2747       i->op = OP_MUL;
2748       i->setSrc(1, dst);
2749       // TODO: Handle this properly with a library function
2750    } else {
2751       bld.setPosition(i, true);
2752       i->op = OP_RSQ;
2753       bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
2754    }
2755 
2756    return true;
2757 }
2758 
2759 bool
handlePOW(Instruction * i)2760 NVC0LoweringPass::handlePOW(Instruction *i)
2761 {
2762    LValue *val = bld.getScratch();
2763 
2764    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
2765    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
2766    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
2767 
2768    i->op = OP_EX2;
2769    i->setSrc(0, val);
2770    i->setSrc(1, NULL);
2771 
2772    return true;
2773 }
2774 
2775 bool
handleEXPORT(Instruction * i)2776 NVC0LoweringPass::handleEXPORT(Instruction *i)
2777 {
2778    if (prog->getType() == Program::TYPE_FRAGMENT) {
2779       int id = i->getSrc(0)->reg.data.offset / 4;
2780 
2781       if (i->src(0).isIndirect(0)) // TODO, ugly
2782          return false;
2783       i->op = OP_MOV;
2784       i->subOp = NV50_IR_SUBOP_MOV_FINAL;
2785       i->src(0).set(i->src(1));
2786       i->setSrc(1, NULL);
2787       i->setDef(0, new_LValue(func, FILE_GPR));
2788       i->getDef(0)->reg.data.id = id;
2789 
2790       prog->maxGPR = MAX2(prog->maxGPR, id);
2791    } else
2792    if (prog->getType() == Program::TYPE_GEOMETRY) {
2793       i->setIndirect(0, 1, gpEmitAddress);
2794    }
2795    return true;
2796 }
2797 
2798 bool
handleOUT(Instruction * i)2799 NVC0LoweringPass::handleOUT(Instruction *i)
2800 {
2801    Instruction *prev = i->prev;
2802    ImmediateValue stream, prevStream;
2803 
2804    // Only merge if the stream ids match. Also, note that the previous
2805    // instruction would have already been lowered, so we take arg1 from it.
2806    if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
2807        i->src(0).getImmediate(stream) &&
2808        prev->src(1).getImmediate(prevStream) &&
2809        stream.reg.data.u32 == prevStream.reg.data.u32) {
2810       i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
2811       delete_Instruction(prog, i);
2812    } else {
2813       assert(gpEmitAddress);
2814       i->setDef(0, gpEmitAddress);
2815       i->setSrc(1, i->getSrc(0));
2816       i->setSrc(0, gpEmitAddress);
2817    }
2818    return true;
2819 }
2820 
2821 // Generate a binary predicate if an instruction is predicated by
2822 // e.g. an f32 value.
2823 void
checkPredicate(Instruction * insn)2824 NVC0LoweringPass::checkPredicate(Instruction *insn)
2825 {
2826    Value *pred = insn->getPredicate();
2827    Value *pdst;
2828 
2829    if (!pred || pred->reg.file == FILE_PREDICATE)
2830       return;
2831    pdst = new_LValue(func, FILE_PREDICATE);
2832 
2833    // CAUTION: don't use pdst->getInsn, the definition might not be unique,
2834    //  delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
2835 
2836    bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);
2837 
2838    insn->setPredicate(insn->cc, pdst);
2839 }
2840 
2841 //
2842 // - add quadop dance for texturing
2843 // - put FP outputs in GPRs
2844 // - convert instruction sequences
2845 //
2846 bool
visit(Instruction * i)2847 NVC0LoweringPass::visit(Instruction *i)
2848 {
2849    bool ret = true;
2850    bld.setPosition(i, false);
2851 
2852    if (i->cc != CC_ALWAYS)
2853       checkPredicate(i);
2854 
2855    switch (i->op) {
2856    case OP_TEX:
2857    case OP_TXB:
2858    case OP_TXL:
2859    case OP_TXF:
2860    case OP_TXG:
2861       return handleTEX(i->asTex());
2862    case OP_TXD:
2863       return handleTXD(i->asTex());
2864    case OP_TXLQ:
2865       return handleTXLQ(i->asTex());
2866    case OP_TXQ:
2867      return handleTXQ(i->asTex());
2868    case OP_EX2:
2869       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
2870       i->setSrc(0, i->getDef(0));
2871       break;
2872    case OP_POW:
2873       return handlePOW(i);
2874    case OP_DIV:
2875       return handleDIV(i);
2876    case OP_MOD:
2877       return handleMOD(i);
2878    case OP_SQRT:
2879       return handleSQRT(i);
2880    case OP_EXPORT:
2881       ret = handleEXPORT(i);
2882       break;
2883    case OP_EMIT:
2884    case OP_RESTART:
2885       return handleOUT(i);
2886    case OP_RDSV:
2887       return handleRDSV(i);
2888    case OP_WRSV:
2889       return handleWRSV(i);
2890    case OP_STORE:
2891    case OP_LOAD:
2892       handleLDST(i);
2893       break;
2894    case OP_ATOM:
2895    {
2896       const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
2897       handleATOM(i);
2898       handleCasExch(i, cctl);
2899    }
2900       break;
2901    case OP_SULDB:
2902    case OP_SULDP:
2903    case OP_SUSTB:
2904    case OP_SUSTP:
2905    case OP_SUREDB:
2906    case OP_SUREDP:
2907       if (targ->getChipset() >= NVISA_GM107_CHIPSET)
2908          handleSurfaceOpGM107(i->asTex());
2909       else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
2910          handleSurfaceOpNVE4(i->asTex());
2911       else
2912          handleSurfaceOpNVC0(i->asTex());
2913       break;
2914    case OP_SUQ:
2915       handleSUQ(i->asTex());
2916       break;
2917    case OP_BUFQ:
2918       handleBUFQ(i);
2919       break;
2920    default:
2921       break;
2922    }
2923 
2924    /* Kepler+ has a special opcode to compute a new base address to be used
2925     * for indirect loads.
2926     *
2927     * Maxwell+ has an additional similar requirement for indirect
2928     * interpolation ops in frag shaders.
2929     */
2930    bool doAfetch = false;
2931    if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
2932        !i->perPatch &&
2933        (i->op == OP_VFETCH || i->op == OP_EXPORT) &&
2934        i->src(0).isIndirect(0)) {
2935       doAfetch = true;
2936    }
2937    if (targ->getChipset() >= NVISA_GM107_CHIPSET &&
2938        (i->op == OP_LINTERP || i->op == OP_PINTERP) &&
2939        i->src(0).isIndirect(0)) {
2940       doAfetch = true;
2941    }
2942 
2943    if (doAfetch) {
2944       Value *addr = cloneShallow(func, i->getSrc(0));
2945       Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
2946                                       i->getSrc(0));
2947       afetch->setIndirect(0, 0, i->getIndirect(0, 0));
2948       addr->reg.data.offset = 0;
2949       i->setSrc(0, addr);
2950       i->setIndirect(0, 0, afetch->getDef(0));
2951    }
2952 
2953    return ret;
2954 }
2955 
2956 bool
runLegalizePass(Program * prog,CGStage stage) const2957 TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
2958 {
2959    if (stage == CG_STAGE_PRE_SSA) {
2960       NVC0LoweringPass pass(prog);
2961       return pass.run(prog, false, true);
2962    } else
2963    if (stage == CG_STAGE_POST_RA) {
2964       NVC0LegalizePostRA pass(prog);
2965       return pass.run(prog, false, true);
2966    } else
2967    if (stage == CG_STAGE_SSA) {
2968       NVC0LegalizeSSA pass;
2969       return pass.run(prog, false, true);
2970    }
2971    return false;
2972 }
2973 
2974 } // namespace nv50_ir
2975