1 /*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_build_util.h"
25
26 #include "codegen/nv50_ir_target_nv50.h"
27
28 namespace nv50_ir {
29
// nv50 doesn't support 32 bit integer multiplication
//
//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
// -------------------
//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
//       al*bl
//    ah*bl 00
//
// fffe0001 + fffe0001
//
// Note that this sort of splitting doesn't work for signed values, so we
// compute the sign on those manually and then perform an unsigned multiply.
// Expand an integer MUL into a sequence of half-width MUL/MAD operations.
//
// @param bld  builder used to emit the replacement instructions; it is
//             positioned right after @mul
// @param mul  the multiply to expand; it is deleted on success
// @return false (without emitting anything) when the source type is neither
//         32 nor 64 bits wide after mapping signed types to their unsigned
//         counterparts, true once the expansion has replaced @mul
//
// When subOp is NV50_IR_SUBOP_MUL_HIGH the high half of the product is
// produced (with manual sign handling for signed types), otherwise the low
// half.
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
   ImmediateValue src1;
   bool src1imm = mul->src(1).getImmediate(src1);

   // The multiply itself is always carried out on unsigned values.
   DataType fTy; // full type
   switch (mul->sType) {
   case TYPE_S32: fTy = TYPE_U32; break;
   case TYPE_S64: fTy = TYPE_U64; break;
   default: fTy = mul->sType; break;
   }

   DataType hTy; // half type
   switch (fTy) {
   case TYPE_U32: hTy = TYPE_U16; break;
   case TYPE_U64: hTy = TYPE_U32; break;
   default:
      return false;
   }
   unsigned int fullSize = typeSizeof(fTy);
   unsigned int halfSize = typeSizeof(hTy);

   // Only i[2]..i[5] are revisited at the end (to patch their source type);
   // the remaining slots merely keep the emitted instructions around.
   Instruction *i[9];

   bld->setPosition(mul, true);

   Value *s[2];
   Value *a[2], *b[2];
   Value *t[4];
   for (int j = 0; j < 4; ++j)
      t[j] = bld->getSSA(fullSize);

   if (isSignedType(mul->sType) && highResult) {
      // Signed high multiply: operate on the absolute values, the sign of the
      // result is restored further down.
      s[0] = bld->getSSA(fullSize);
      s[1] = bld->getSSA(fullSize);
      bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
      bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
      // Executed whether or not src1 is an immediate; the value is only
      // consulted below when src1imm is set.
      src1.reg.data.s32 = abs(src1.reg.data.s32);
   } else {
      s[0] = mul->getSrc(0);
      s[1] = mul->getSrc(1);
   }

   // split sources into halves
   // (presumably index 0 is the low half and index 1 the high half, matching
   // al/ah in the diagram above the function -- TODO confirm against mkSplit)
   i[0] = bld->mkSplit(a, halfSize, s[0]);
   i[1] = bld->mkSplit(b, halfSize, s[1]);

   // t[1] receives the sum of the cross products. With an immediate second
   // operand, products against an all-zero half are skipped.
   // NOTE(review): the immediate is examined as a u32 with 16-bit masks even
   // when the full type is 64 bits wide -- confirm this is intended.
   if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
      i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
                               bld->mkImm(src1.reg.data.u32 & 0xffff));
   } else {
      i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
                        src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
      if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
         i[3] = i[2];
         t[1] = t[0];
      } else {
         i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
      }
   }
   // t[2] = cross products shifted up by half the width;
   // t[3] = t[2] + a[0] * b[0], which is the low result.
   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
   if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
      i[4] = i[3];
      t[3] = t[2];
   } else {
      i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
   }

   if (highResult) {
      Value *c[2];
      Value *r[5];
      // NOTE(review): for the 64-bit case halfSize * 8 is 32, which shifts an
      // int literal by its full width -- confirm this path is exercised.
      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
      c[0] = bld->getSSA(1, FILE_FLAGS);
      c[1] = bld->getSSA(1, FILE_FLAGS);
      for (int j = 0; j < 5; ++j)
         r[j] = bld->getSSA(fullSize);

      // r[0] = upper half of the cross-product sum. r[1] (carry set) and r[3]
      // (carry clear) are the two predicated variants merged into r[2], and
      // r[4] = a[1] * b[1] + r[2] with c[1] as flags source.
      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
      bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
      i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);

      // set carry defs / sources
      i[3]->setFlagsDef(1, c[0]);
      // actual result required in negative case, but ignored for
      // unsigned. for some reason the compiler ends up dropping the whole
      // instruction if the destination is unused but the flags are.
      if (isSignedType(mul->sType))
         i[4]->setFlagsDef(1, c[1]);
      else
         i[4]->setFlagsDef(0, c[1]);
      i[6]->setPredicate(CC_C, c[0]);
      i[5]->setFlagsSrc(3, c[1]);

      if (isSignedType(mul->sType)) {
         Value *cc[2];
         Value *rr[7];
         Value *one = bld->getSSA(fullSize);
         bld->loadImm(one, 1);
         for (int j = 0; j < 7; j++)
            rr[j] = bld->getSSA(fullSize);

         // NOTE: this logic uses predicates because splitting basic blocks is
         // ~impossible during the SSA phase. The RA relies on a correlation
         // between edge order and phi node sources.

         // Set the sign of the result based on the inputs
         bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
            ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));

         // 1s complement of 64-bit value
         bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
            ->setPredicate(CC_S, cc[0]);
         bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
            ->setPredicate(CC_S, cc[0]);

         // add to low 32-bits, keep track of the carry
         Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
         n->setPredicate(CC_S, cc[0]);
         n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));

         // If there was a carry, add 1 to the upper 32 bits
         // XXX: These get executed even if they shouldn't be
         bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
            ->setPredicate(CC_C, cc[1]);
         bld->mkMov(rr[3], rr[0])
            ->setPredicate(CC_NC, cc[1]);
         bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);

         // Merge the results from the negative and non-negative paths
         bld->mkMov(rr[5], rr[4])
            ->setPredicate(CC_S, cc[0]);
         bld->mkMov(rr[6], r[4])
            ->setPredicate(CC_NS, cc[0]);
         bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
      } else {
         bld->mkMov(mul->getDef(0), r[4]);
      }
   } else {
      bld->mkMov(mul->getDef(0), t[3]);
   }
   delete_Instruction(bld->getProgram(), mul);

   // The partial products take half-width sources: patch the source type of
   // the MUL/MAD instructions (i[5] only exists for the high result).
   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
      if (i[j])
         i[j]->sType = hTy;

   return true;
}
195
// Per-lane operation selectors used to build a quad operation code;
// each one occupies two bits of the code assembled by QUADOP().
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Pack four QOP_* selectors (given without the QOP_ prefix) into one 8-bit
// code: q lands in bits 7:6, r in 5:4, s in 3:2 and t in 1:0.
// The column legend below presumably stands for the upper-left, upper-right,
// lower-left and lower-right lane of a quad -- TODO confirm.
//             UL UR LL LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))
205
// Pass that walks functions and basic blocks to remove no-ops, emulate
// PRERET on chipsets below 0xa0, split 64-bit operations and replace zero
// immediates by a register (see the visit() implementations).
class NV50LegalizePostRA : public Pass
{
private:
   // Sets up r63 and patches the saved output writes into their producers.
   virtual bool visit(Function *);
   // Performs the per-instruction legalization of one basic block.
   virtual bool visit(BasicBlock *);

   // Rewrites a PRERET into the emulation sequence (branch / skip / call).
   void handlePRERET(FlowInstruction *);
   // Replaces every immediate source equal to 0 with r63.
   void replaceZero(Instruction *);

   // GPR value with id 63 or 127 (chosen from prog->maxGPR in
   // visit(Function *)); substituted for zero immediates.
   LValue *r63;
};
217
218 bool
visit(Function * fn)219 NV50LegalizePostRA::visit(Function *fn)
220 {
221 Program *prog = fn->getProgram();
222
223 r63 = new_LValue(fn, FILE_GPR);
224 // GPR units on nv50 are in half-regs
225 if (prog->maxGPR < 126)
226 r63->reg.data.id = 63;
227 else
228 r63->reg.data.id = 127;
229
230 // this is actually per-program, but we can do it all on visiting main()
231 std::list<Instruction *> *outWrites =
232 reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
233
234 if (outWrites) {
235 for (std::list<Instruction *>::iterator it = outWrites->begin();
236 it != outWrites->end(); ++it)
237 (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
238 // instructions will be deleted on exit
239 outWrites->clear();
240 }
241
242 return true;
243 }
244
245 void
replaceZero(Instruction * i)246 NV50LegalizePostRA::replaceZero(Instruction *i)
247 {
248 for (int s = 0; i->srcExists(s); ++s) {
249 ImmediateValue *imm = i->getSrc(s)->asImm();
250 if (imm && imm->reg.data.u64 == 0)
251 i->setSrc(s, r63);
252 }
253 }
254
255 // Emulate PRERET: jump to the target and call to the origin from there
256 //
257 // WARNING: atm only works if BBs are affected by at most a single PRERET
258 //
259 // BB:0
260 // preret BB:3
261 // (...)
262 // BB:3
263 // (...)
264 // --->
265 // BB:0
266 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
267 // (...)
268 // BB:3
269 // bra BB:3 + n1 (skip the call)
270 // call BB:0 + n2 (skip bra at beginning of BB:0)
271 // (...)
272 void
handlePRERET(FlowInstruction * pre)273 NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
274 {
275 BasicBlock *bbE = pre->bb;
276 BasicBlock *bbT = pre->target.bb;
277
278 pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
279 bbE->remove(pre);
280 bbE->insertHead(pre);
281
282 Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
283 Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
284
285 bbT->insertHead(call);
286 bbT->insertHead(skip);
287
288 // NOTE: maybe split blocks to prevent the instructions from moving ?
289
290 skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
291 call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
292 }
293
294 bool
visit(BasicBlock * bb)295 NV50LegalizePostRA::visit(BasicBlock *bb)
296 {
297 Instruction *i, *next;
298
299 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
300 for (i = bb->getFirst(); i; i = next) {
301 next = i->next;
302 if (i->isNop()) {
303 bb->remove(i);
304 } else
305 if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
306 handlePRERET(i->asFlow());
307 } else {
308 // TODO: We will want to do this before register allocation,
309 // since have to use a $c register for the carry flag.
310 if (typeSizeof(i->dType) == 8) {
311 Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
312 if (hi)
313 next = hi;
314 }
315
316 if (i->op != OP_PFETCH && i->op != OP_BAR &&
317 (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
318 replaceZero(i);
319 }
320 }
321 if (!bb->getEntry())
322 return true;
323
324 return true;
325 }
326
// Pass run on basic blocks that lowers integer MUL/MAD/DIV/MOD, fixes up
// instructions defining address registers and collects output writes that
// can be folded into their producers (see visit(BasicBlock *)).
class NV50LegalizeSSA : public Pass
{
public:
   NV50LegalizeSSA(Program *);

   virtual bool visit(BasicBlock *bb);

private:
   // Queues an EXPORT in outWrites and removes it from its block when the
   // exported value's producer qualifies.
   void propagateWriteToOutput(Instruction *);
   // Lowers 32-bit integer division via f32 arithmetic.
   void handleDIV(Instruction *);
   // Lowers 32-bit integer modulo via handleDIV plus multiply/subtract.
   void handleMOD(Instruction *);
   // Lowers integer MUL/MAD wider than 16 bits via expandIntegerMUL().
   void handleMUL(Instruction *);
   // Makes an instruction writing an address register legal.
   void handleAddrDef(Instruction *);

   // True for SHL(GPR, immediate 0).
   inline bool isARL(const Instruction *) const;

   BuildUtil bld;

   // List taken from prog->targetPriv, or NULL when output-write propagation
   // is disabled (see the constructor).
   std::list<Instruction *> *outWrites;
};
347
NV50LegalizeSSA(Program * prog)348 NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
349 {
350 bld.setProgram(prog);
351
352 if (prog->optLevel >= 2 &&
353 (prog->getType() == Program::TYPE_GEOMETRY ||
354 prog->getType() == Program::TYPE_VERTEX))
355 outWrites =
356 reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
357 else
358 outWrites = NULL;
359 }
360
361 void
propagateWriteToOutput(Instruction * st)362 NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
363 {
364 if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
365 return;
366
367 // check def instruction can store
368 Instruction *di = st->getSrc(1)->defs.front()->getInsn();
369
370 // TODO: move exports (if beneficial) in common opt pass
371 if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
372 return;
373
374 for (int s = 0; di->srcExists(s); ++s)
375 if (di->src(s).getFile() == FILE_IMMEDIATE ||
376 di->src(s).getFile() == FILE_MEMORY_LOCAL)
377 return;
378
379 if (prog->getType() == Program::TYPE_GEOMETRY) {
380 // Only propagate output writes in geometry shaders when we can be sure
381 // that we are propagating to the same output vertex.
382 if (di->bb != st->bb)
383 return;
384 Instruction *i;
385 for (i = di; i != st; i = i->next) {
386 if (i->op == OP_EMIT || i->op == OP_RESTART)
387 return;
388 }
389 assert(i); // st after di
390 }
391
392 // We cannot set defs to non-lvalues before register allocation, so
393 // save & remove (to save registers) the exports and replace later.
394 outWrites->push_back(st);
395 st->bb->remove(st);
396 }
397
398 bool
isARL(const Instruction * i) const399 NV50LegalizeSSA::isARL(const Instruction *i) const
400 {
401 ImmediateValue imm;
402
403 if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
404 return false;
405 if (!i->src(1).getImmediate(imm))
406 return false;
407 return imm.isInteger(0);
408 }
409
410 void
handleAddrDef(Instruction * i)411 NV50LegalizeSSA::handleAddrDef(Instruction *i)
412 {
413 Instruction *arl;
414
415 i->getDef(0)->reg.size = 2; // $aX are only 16 bit
416
417 // PFETCH can always write to $a
418 if (i->op == OP_PFETCH)
419 return;
420 // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
421 if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
422 if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
423 return;
424 if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
425 return;
426 }
427
428 // turn $a sources into $r sources (can't operate on $a)
429 for (int s = 0; i->srcExists(s); ++s) {
430 Value *a = i->getSrc(s);
431 Value *r;
432 if (a->reg.file == FILE_ADDRESS) {
433 if (a->getInsn() && isARL(a->getInsn())) {
434 i->setSrc(s, a->getInsn()->getSrc(0));
435 } else {
436 bld.setPosition(i, false);
437 r = bld.getSSA();
438 bld.mkMov(r, a);
439 i->setSrc(s, r);
440 }
441 }
442 }
443 if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
444 return;
445
446 // turn result back into $a
447 bld.setPosition(i, true);
448 arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
449 i->setDef(0, arl->getSrc(0));
450 }
451
452 void
handleMUL(Instruction * mul)453 NV50LegalizeSSA::handleMUL(Instruction *mul)
454 {
455 if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
456 return;
457 Value *def = mul->getDef(0);
458 Value *pred = mul->getPredicate();
459 CondCode cc = mul->cc;
460 if (pred)
461 mul->setPredicate(CC_ALWAYS, NULL);
462
463 if (mul->op == OP_MAD) {
464 Instruction *add = mul;
465 bld.setPosition(add, false);
466 Value *res = cloneShallow(func, mul->getDef(0));
467 mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
468 add->op = OP_ADD;
469 add->setSrc(0, mul->getDef(0));
470 add->setSrc(1, add->getSrc(2));
471 for (int s = 2; add->srcExists(s); ++s)
472 add->setSrc(s, NULL);
473 mul->subOp = add->subOp;
474 add->subOp = 0;
475 }
476 expandIntegerMUL(&bld, mul);
477 if (pred)
478 def->getInsn()->setPredicate(cc, pred);
479 }
480
481 // Use f32 division: first compute an approximate result, use it to reduce
482 // the dividend, which should then be representable as f32, divide the reduced
483 // dividend, and add the quotients.
// Lower a 32-bit integer division (U32 or S32) to f32 arithmetic plus
// integer correction steps; other types are left untouched. The original
// instruction is reused for the final step: it becomes a SUB (unsigned) or a
// UNION of the two predicated sign variants (signed).
void
NV50LegalizeSSA::handleDIV(Instruction *div)
{
   const DataType ty = div->sType;

   if (ty != TYPE_U32 && ty != TYPE_S32)
      return;

   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;

   bld.setPosition(div, false);

   Value *a, *af = bld.getSSA();
   Value *b, *bf = bld.getSSA();

   // af / bf: dividend and divisor converted to f32
   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));

   if (isSignedType(ty)) {
      // work on absolute values; the sign is fixed at the end
      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      a = bld.getSSA();
      b = bld.getSSA();
      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
   } else {
      a = div->getSrc(0);
      b = div->getSrc(1);
   }

   // bf = 1 / bf, then an integer ADD of -2 on the result's bits
   // (presumably to bias the reciprocal downwards so the estimate does not
   // overshoot -- TODO confirm)
   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));

   // q0 = first approximation of the quotient (round towards zero)
   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;

   // get error of 1st result
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);

   // note: despite the names, aRf is the integer remainder and aR its f32
   // conversion
   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);

   // divide the reduced dividend the same way
   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
      ->rnd = ROUND_Z;
   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients

   // correction: if modulus >= divisor, add 1
   // (the SET result s is subtracted from q below; presumably a true
   // comparison yields -1 so that the subtraction adds 1 -- TODO confirm)
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
   if (!isSignedType(ty)) {
      // unsigned: the original instruction becomes q - s
      div->op = OP_SUB;
      div->setSrc(0, q);
      div->setSrc(1, s);
   } else {
      t = q;
      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
      s = bld.getSSA();
      t = bld.getSSA();
      // fix the sign
      // (flags come from XOR of the original operands; the negated quotient
      // is selected under CC_S, the plain one under CC_NS)
      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);

      div->op = OP_UNION;
      div->setSrc(0, s);
      div->setSrc(1, t);
   }
}
557
558 void
handleMOD(Instruction * mod)559 NV50LegalizeSSA::handleMOD(Instruction *mod)
560 {
561 if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
562 return;
563 bld.setPosition(mod, false);
564
565 Value *q = bld.getSSA();
566 Value *m = bld.getSSA();
567
568 bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
569 handleDIV(q->getInsn());
570
571 bld.setPosition(mod, false);
572 expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
573
574 mod->op = OP_SUB;
575 mod->setSrc(1, m);
576 }
577
578 bool
visit(BasicBlock * bb)579 NV50LegalizeSSA::visit(BasicBlock *bb)
580 {
581 Instruction *insn, *next;
582 // skipping PHIs (don't pass them to handleAddrDef) !
583 for (insn = bb->getEntry(); insn; insn = next) {
584 next = insn->next;
585
586 if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
587 handleAddrDef(insn);
588
589 switch (insn->op) {
590 case OP_EXPORT:
591 if (outWrites)
592 propagateWriteToOutput(insn);
593 break;
594 case OP_DIV:
595 handleDIV(insn);
596 break;
597 case OP_MOD:
598 handleMOD(insn);
599 break;
600 case OP_MAD:
601 case OP_MUL:
602 handleMUL(insn);
603 break;
604 default:
605 break;
606 }
607 }
608 return true;
609 }
610
// Lowering pass made up of per-operation handlers (system value access,
// texturing, control flow, ...) that rewrite instructions through a
// BuildUtil.
class NV50LoweringPreSSA : public Pass
{
public:
   NV50LoweringPreSSA(Program *);

private:
   virtual bool visit(Instruction *);
   virtual bool visit(Function *);

   bool handleRDSV(Instruction *);
   bool handleWRSV(Instruction *);

   bool handlePFETCH(Instruction *);
   bool handleEXPORT(Instruction *);
   bool handleLOAD(Instruction *);

   bool handleDIV(Instruction *);
   bool handleSQRT(Instruction *);
   bool handlePOW(Instruction *);

   bool handleSET(Instruction *);
   bool handleSLCT(CmpInstruction *);
   bool handleSELP(Instruction *);

   bool handleTEX(TexInstruction *);
   bool handleTXB(TexInstruction *); // I really
   bool handleTXL(TexInstruction *); // hate
   bool handleTXD(TexInstruction *); // these 3
   bool handleTXLQ(TexInstruction *);
   bool handleTXQ(TexInstruction *);

   bool handleCALL(Instruction *);
   bool handlePRECONT(Instruction *);
   bool handleCONT(Instruction *);

   void checkPredicate(Instruction *);
   // Loads the per-texture multisample settings from the aux constant buffer;
   // *ms receives the sum of the two loaded values *ms_x and *ms_y.
   void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
   // Loads the x/y deltas for sample s at multisample level ms.
   void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);

private:
   // Target of the program being lowered.
   const Target *const targ;

   BuildUtil bld;

   // Copy of the implicit thread id argument; only set up for compute
   // programs in visit(Function *), NULL otherwise.
   Value *tid;
};
657
// Remember the program's target, start without a thread id value and bind
// the builder to @prog.
NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
   targ(prog->getTarget()), tid(NULL)
{
   bld.setProgram(prog);
}
663
664 bool
visit(Function * f)665 NV50LoweringPreSSA::visit(Function *f)
666 {
667 BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
668
669 if (prog->getType() == Program::TYPE_COMPUTE) {
670 // Add implicit "thread id" argument in $r0 to the function
671 Value *arg = new_LValue(func, FILE_GPR);
672 arg->reg.data.id = 0;
673 f->ins.push_back(arg);
674
675 bld.setPosition(root, false);
676 tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
677 }
678
679 return true;
680 }
681
loadTexMsInfo(uint32_t off,Value ** ms,Value ** ms_x,Value ** ms_y)682 void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
683 Value **ms_x, Value **ms_y) {
684 // This loads the texture-indexed ms setting from the constant buffer
685 Value *tmp = new_LValue(func, FILE_GPR);
686 uint8_t b = prog->driver->io.auxCBSlot;
687 off += prog->driver->io.suInfoBase;
688 if (prog->getType() > Program::TYPE_VERTEX)
689 off += 16 * 2 * 4;
690 if (prog->getType() > Program::TYPE_GEOMETRY)
691 off += 16 * 2 * 4;
692 *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
693 FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
694 *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
695 FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
696 *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
697 }
698
loadMsInfo(Value * ms,Value * s,Value ** dx,Value ** dy)699 void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
700 // Given a MS level, and a sample id, compute the delta x/y
701 uint8_t b = prog->driver->io.msInfoCBSlot;
702 Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);
703
704 // The required information is at mslevel * 16 * 4 + sample * 8
705 // = (mslevel * 8 + sample) * 8
706 bld.mkOp2(OP_SHL,
707 TYPE_U32,
708 off,
709 bld.mkOp2v(OP_ADD, TYPE_U32, t,
710 bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
711 s),
712 bld.mkImm(3));
713 *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
714 FILE_MEMORY_CONST, b, TYPE_U32,
715 prog->driver->io.msInfoBase), off);
716 *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
717 FILE_MEMORY_CONST, b, TYPE_U32,
718 prog->driver->io.msInfoBase + 4), off);
719 }
720
// Common lowering for texture instructions: normalizes cube coordinates,
// resolves multisample coordinates, orders dref/lod sources, converts and
// clamps the array layer, flattens cube arrays to 2D arrays and moves
// immediate texel offsets into the instruction. Always returns true.
bool
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
{
   const int arg = i->tex.target.getArgCount();
   // source slots of the depth reference and of the bias/lod value
   const int dref = arg;
   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;

   /* Only normalize in the non-explicit derivatives case.
    */
   if (i->tex.target.isCube() && i->op != OP_TXD) {
      Value *src[3], *val;
      int c;
      // val = 1 / max(|x|, |y|, |z|); scale all three coordinates by it
      for (c = 0; c < 3; ++c)
         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
      val = bld.getScratch();
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c) {
         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
                                 i->getSrc(c), val));
      }
   }

   // handle MS, which means looking up the MS params for this texture, and
   // adjusting the input coordinates to point at the right sample.
   if (i->tex.target.isMS()) {
      Value *x = i->getSrc(0);
      Value *y = i->getSrc(1);
      Value *s = i->getSrc(arg - 1);
      Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
         *ms, *ms_x, *ms_y, *dx, *dy;

      i->tex.target.clearMS();

      loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
      loadMsInfo(ms, s, &dx, &dy);

      // coordinate = (coordinate << ms shift) + per-sample delta
      bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
      bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
      bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
      bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
      i->setSrc(0, tx);
      i->setSrc(1, ty);
      // the sample index source is replaced by a zero
      i->setSrc(arg - 1, bld.loadImm(NULL, 0));
   }

   // dref comes before bias/lod
   if (i->tex.target.isShadow())
      if (i->op == OP_TXB || i->op == OP_TXL)
         i->swapSources(dref, lod);

   if (i->tex.target.isArray()) {
      if (i->op != OP_TXF) {
         // array index must be converted to u32, but it's already an integer
         // for TXF
         Value *layer = i->getSrc(arg - 1);
         LValue *src = new_LValue(func, FILE_GPR);
         bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
         // clamp the layer index to 511
         bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
         i->setSrc(arg - 1, src);
      }
      if (i->tex.target.isCube() && i->srcCount() > 4) {
         std::vector<Value *> acube, a2d;
         int c;

         // TEXPREP takes the four cube-array sources and produces the three
         // values that replace them as 2D array coordinates.
         acube.resize(4);
         for (c = 0; c < 4; ++c)
            acube[c] = i->getSrc(c);
         a2d.resize(4);
         for (c = 0; c < 3; ++c)
            a2d[c] = new_LValue(func, FILE_GPR);
         a2d[3] = NULL;

         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
                   a2d, acube)->asTex()->tex.mask = 0x7;

         for (c = 0; c < 3; ++c)
            i->setSrc(c, a2d[c]);
         // shift the remaining sources down by one slot
         for (; i->srcExists(c + 1); ++c)
            i->setSrc(c, i->getSrc(c + 1));
         i->setSrc(c, NULL);
         assert(c <= 4);

         i->tex.target = i->tex.target.isShadow() ?
            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
      }
   }

   // texel offsets are 3 immediate fields in the instruction,
   // nv50 cannot do textureGatherOffsets
   assert(i->tex.useOffsets <= 1);
   if (i->tex.useOffsets) {
      for (int c = 0; c < 3; ++c) {
         ImmediateValue val;
         if (!i->offset[0][c].getImmediate(val))
            assert(!"non-immediate offset");
         // copy the immediate into the instruction and drop the source
         i->tex.offset[c] = val.reg.data.u32;
         i->offset[0][c].set(NULL);
      }
   }

   return true;
}
825
826 // Bias must be equal for all threads of a quad or lod calculation will fail.
827 //
828 // The lanes of a quad are grouped by the bit in the condition register they
829 // have set, which is selected by differing bias values.
830 // Move the input values for TEX into a new register set for each group and
831 // execute TEX only for a specific group.
832 // We always need to use 4 new registers for the inputs/outputs because the
833 // implicitly calculated derivatives must be correct.
834 //
835 // TODO: move to SSA phase so we can easily determine whether bias is constant
// Lower a biased texture fetch. A uniform bias only needs handleTEX();
// otherwise the instruction is cloned four times, each clone predicated on a
// different condition code of a computed flags value, and the clone results
// are merged into the original defs. Always returns true.
bool
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
{
   // one condition code per lane group
   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
   int l, d;

   // We can't actually apply bias *and* do a compare for a cube
   // texture. Since the compare has to be done before the filtering, just
   // drop the bias on the floor.
   if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
      i->op = OP_TEX;
      i->setSrc(3, i->getSrc(4));
      i->setSrc(4, NULL);
      return handleTEX(i);
   }

   handleTEX(i);
   Value *bias = i->getSrc(i->tex.target.getArgCount());
   if (bias->isUniform())
      return true;

   // cond unions the per-lane bits; source 0 is the constant 1, sources 1..3
   // are filled in by the loop below.
   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
                                 bld.loadImm(NULL, 1));
   bld.setPosition(cond, false);

   for (l = 1; l < 4; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *bit = bld.getSSA();
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      Value *imm = bld.loadImm(NULL, (1 << l));
      // quad operation on bias for lane l, writing flags; the bit is set
      // where the result compares equal
      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
      cond->setSrc(l, bit);
   }
   // convert the merged bit value into a flags value
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;

   // one predicated clone of the texture instruction per condition code
   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
      bld.insert(tex[l]);
   }

   // res[l][d]: result d of clone l; clone 0 is used directly, the others
   // go through predicated moves into shallow copies.
   Value *res[4][4];
   for (d = 0; i->defExists(d); ++d)
      res[0][d] = tex[0]->getDef(d);
   for (l = 1; l < 4; ++l) {
      for (d = 0; tex[l]->defExists(d); ++d) {
         res[l][d] = cloneShallow(func, res[0][d]);
         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
      }
   }

   // merge the four variants into each original def
   for (d = 0; i->defExists(d); ++d) {
      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
      for (l = 0; l < 4; ++l)
         dst->setSrc(l, res[l][d]);
   }
   delete_Instruction(prog, i);
   return true;
}
898
899 // LOD must be equal for all threads of a quad.
900 // Unlike with TXB, here we can just diverge since there's no LOD calculation
901 // that would require all 4 threads' sources to be set up properly.
// Lower a texture fetch with explicit LOD. A uniform LOD only needs
// handleTEX(); otherwise the block is split around the instruction and a
// chain of per-lane tests branches to the texture block, with a JOINAT/JOIN
// pair around the divergent region. Always returns true.
bool
NV50LoweringPreSSA::handleTXL(TexInstruction *i)
{
   handleTEX(i);
   Value *lod = i->getSrc(i->tex.target.getArgCount());
   if (lod->isUniform())
      return true;

   // currBB: code before the fetch, texiBB: the fetch, joinBB: code after it
   BasicBlock *currBB = i->bb;
   BasicBlock *texiBB = i->bb->splitBefore(i, false);
   BasicBlock *joinBB = i->bb->splitAfter(i);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   for (int l = 0; l <= 3; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      bld.setPosition(currBB, true);
      // quad operation on lod for lane l, writing flags; branch to the
      // texture block where the result compares equal
      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
      if (l <= 2) {
         // all but the last test fall through into a new block that holds
         // the test for the next lane
         BasicBlock *laneBB = new BasicBlock(func);
         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
         currBB = laneBB;
      }
   }
   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
   return true;
}
935
// Lower a texture fetch with explicit derivatives: for each of the four
// lanes, that lane's coordinates are broadcast, dPdx/dPdy are applied through
// quad operations, a plain TEX clone is executed, and only that lane's
// result is kept; the per-lane results are finally merged into the original
// defs and the original instruction is removed from its block.
// Always returns true.
bool
NV50LoweringPreSSA::handleTXD(TexInstruction *i)
{
   // per lane: { quad op used for dPdx, quad op used for dPdy }
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   // def[c][l]: result component c as produced for lane l
   Value *def[4][4];
   Value *crd[3];
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   // number of coordinates, taken before handleTEX() rewrites the target
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();

   handleTEX(i);
   i->op = OP_TEX; // no need to clone dPdx/dPdy later
   i->tex.derivAll = true;

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // normalize cube coordinates if necessary
      if (i->tex.target.isCube()) {
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, src[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         // restrict the move to lane l
         mov->lanes = 1 << l;
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   // merge the per-lane results into the original defs
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}
1010
1011 bool
handleTXLQ(TexInstruction * i)1012 NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
1013 {
1014 handleTEX(i);
1015 bld.setPosition(i, true);
1016
1017 /* The returned values are not quite what we want:
1018 * (a) convert from s32 to f32
1019 * (b) multiply by 1/256
1020 */
1021 for (int def = 0; def < 2; ++def) {
1022 if (!i->defExists(def))
1023 continue;
1024 bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
1025 bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
1026 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
1027 }
1028 return true;
1029 }
1030
1031 bool
handleTXQ(TexInstruction * i)1032 NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
1033 {
1034 Value *ms, *ms_x, *ms_y;
1035 if (i->tex.query == TXQ_DIMS)
1036 return true;
1037 assert(i->tex.query == TXQ_TYPE);
1038 assert(i->tex.mask == 4);
1039
1040 loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
1041 bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);
1042 i->bb->remove(i);
1043
1044 return true;
1045 }
1046
1047
1048 bool
handleSET(Instruction * i)1049 NV50LoweringPreSSA::handleSET(Instruction *i)
1050 {
1051 if (i->dType == TYPE_F32) {
1052 bld.setPosition(i, true);
1053 i->dType = TYPE_U32;
1054 bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
1055 bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
1056 }
1057 return true;
1058 }
1059
1060 bool
handleSLCT(CmpInstruction * i)1061 NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
1062 {
1063 Value *src0 = bld.getSSA();
1064 Value *src1 = bld.getSSA();
1065 Value *pred = bld.getScratch(1, FILE_FLAGS);
1066
1067 Value *v0 = i->getSrc(0);
1068 Value *v1 = i->getSrc(1);
1069 // XXX: these probably shouldn't be immediates in the first place ...
1070 if (v0->asImm())
1071 v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1072 if (v1->asImm())
1073 v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1074
1075 bld.setPosition(i, true);
1076 bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
1077 bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
1078 bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1079
1080 bld.setPosition(i, false);
1081 i->op = OP_SET;
1082 i->setFlagsDef(0, pred);
1083 i->dType = TYPE_U8;
1084 i->setSrc(0, i->getSrc(2));
1085 i->setSrc(2, NULL);
1086 i->setSrc(1, bld.loadImm(NULL, 0));
1087
1088 return true;
1089 }
1090
1091 bool
handleSELP(Instruction * i)1092 NV50LoweringPreSSA::handleSELP(Instruction *i)
1093 {
1094 Value *src0 = bld.getSSA();
1095 Value *src1 = bld.getSSA();
1096
1097 Value *v0 = i->getSrc(0);
1098 Value *v1 = i->getSrc(1);
1099 if (v0->asImm())
1100 v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1101 if (v1->asImm())
1102 v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1103
1104 bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
1105 bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
1106 bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1107 delete_Instruction(prog, i);
1108 return true;
1109 }
1110
1111 bool
handleWRSV(Instruction * i)1112 NV50LoweringPreSSA::handleWRSV(Instruction *i)
1113 {
1114 Symbol *sym = i->getSrc(0)->asSym();
1115
1116 // these are all shader outputs, $sreg are not writeable
1117 uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
1118 if (addr >= 0x400)
1119 return false;
1120 sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
1121
1122 bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
1123
1124 bld.getBB()->remove(i);
1125 return true;
1126 }
1127
1128 bool
handleCALL(Instruction * i)1129 NV50LoweringPreSSA::handleCALL(Instruction *i)
1130 {
1131 if (prog->getType() == Program::TYPE_COMPUTE) {
1132 // Add implicit "thread id" argument in $r0 to the function
1133 i->setSrc(i->srcCount(), tid);
1134 }
1135 return true;
1136 }
1137
1138 bool
handlePRECONT(Instruction * i)1139 NV50LoweringPreSSA::handlePRECONT(Instruction *i)
1140 {
1141 delete_Instruction(prog, i);
1142 return true;
1143 }
1144
1145 bool
handleCONT(Instruction * i)1146 NV50LoweringPreSSA::handleCONT(Instruction *i)
1147 {
1148 i->op = OP_BRA;
1149 return true;
1150 }
1151
1152 bool
handleRDSV(Instruction * i)1153 NV50LoweringPreSSA::handleRDSV(Instruction *i)
1154 {
1155 Symbol *sym = i->getSrc(0)->asSym();
1156 uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
1157 Value *def = i->getDef(0);
1158 SVSemantic sv = sym->reg.data.sv.sv;
1159 int idx = sym->reg.data.sv.index;
1160
1161 if (addr >= 0x400) // mov $sreg
1162 return true;
1163
1164 switch (sv) {
1165 case SV_POSITION:
1166 assert(prog->getType() == Program::TYPE_FRAGMENT);
1167 bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
1168 break;
1169 case SV_FACE:
1170 bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
1171 if (i->dType == TYPE_F32) {
1172 bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
1173 bld.mkOp1(OP_NEG, TYPE_S32, def, def);
1174 bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
1175 }
1176 break;
1177 case SV_NCTAID:
1178 case SV_CTAID:
1179 case SV_NTID:
1180 if ((sv == SV_NCTAID && idx >= 2) ||
1181 (sv == SV_NTID && idx >= 3)) {
1182 bld.mkMov(def, bld.mkImm(1));
1183 } else if (sv == SV_CTAID && idx >= 2) {
1184 bld.mkMov(def, bld.mkImm(0));
1185 } else {
1186 Value *x = bld.getSSA(2);
1187 bld.mkOp1(OP_LOAD, TYPE_U16, x,
1188 bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
1189 bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
1190 }
1191 break;
1192 case SV_TID:
1193 if (idx == 0) {
1194 bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
1195 } else if (idx == 1) {
1196 bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
1197 bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
1198 } else if (idx == 2) {
1199 bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
1200 } else {
1201 bld.mkMov(def, bld.mkImm(0));
1202 }
1203 break;
1204 case SV_SAMPLE_POS: {
1205 Value *off = new_LValue(func, FILE_ADDRESS);
1206 bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
1207 bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
1208 bld.mkLoad(TYPE_F32,
1209 def,
1210 bld.mkSymbol(
1211 FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
1212 TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
1213 off);
1214 break;
1215 }
1216 default:
1217 bld.mkFetch(i->getDef(0), i->dType,
1218 FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
1219 break;
1220 }
1221 bld.getBB()->remove(i);
1222 return true;
1223 }
1224
1225 bool
handleDIV(Instruction * i)1226 NV50LoweringPreSSA::handleDIV(Instruction *i)
1227 {
1228 if (!isFloatType(i->dType))
1229 return true;
1230 bld.setPosition(i, false);
1231 Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
1232 i->op = OP_MUL;
1233 i->setSrc(1, rcp->getDef(0));
1234 return true;
1235 }
1236
1237 bool
handleSQRT(Instruction * i)1238 NV50LoweringPreSSA::handleSQRT(Instruction *i)
1239 {
1240 bld.setPosition(i, true);
1241 i->op = OP_RSQ;
1242 bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
1243
1244 return true;
1245 }
1246
1247 bool
handlePOW(Instruction * i)1248 NV50LoweringPreSSA::handlePOW(Instruction *i)
1249 {
1250 LValue *val = bld.getScratch();
1251
1252 bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
1253 bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
1254 bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
1255
1256 i->op = OP_EX2;
1257 i->setSrc(0, val);
1258 i->setSrc(1, NULL);
1259
1260 return true;
1261 }
1262
1263 bool
handleEXPORT(Instruction * i)1264 NV50LoweringPreSSA::handleEXPORT(Instruction *i)
1265 {
1266 if (prog->getType() == Program::TYPE_FRAGMENT) {
1267 if (i->getIndirect(0, 0)) {
1268 // TODO: redirect to l[] here, load to GPRs at exit
1269 return false;
1270 } else {
1271 int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
1272
1273 i->op = OP_MOV;
1274 i->subOp = NV50_IR_SUBOP_MOV_FINAL;
1275 i->src(0).set(i->src(1));
1276 i->setSrc(1, NULL);
1277 i->setDef(0, new_LValue(func, FILE_GPR));
1278 i->getDef(0)->reg.data.id = id;
1279
1280 prog->maxGPR = MAX2(prog->maxGPR, id * 2);
1281 }
1282 }
1283 return true;
1284 }
1285
1286 // Handle indirect addressing in geometry shaders:
1287 //
1288 // ld $r0 a[$a1][$a2+k] ->
1289 // ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
1290 //
1291 bool
handleLOAD(Instruction * i)1292 NV50LoweringPreSSA::handleLOAD(Instruction *i)
1293 {
1294 ValueRef src = i->src(0);
1295
1296 if (src.isIndirect(1)) {
1297 assert(prog->getType() == Program::TYPE_GEOMETRY);
1298 Value *addr = i->getIndirect(0, 1);
1299
1300 if (src.isIndirect(0)) {
1301 // base address is in an address register, so move to a GPR
1302 Value *base = bld.getScratch();
1303 bld.mkMov(base, addr);
1304
1305 Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
1306 Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
1307 Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1308 i->getIndirect(0, 0), bld.mkImm(2));
1309
1310 // Calculate final address: addr = base + attr*vstride; use 16-bit
1311 // multiplication since 32-bit would be lowered to multiple
1312 // instructions, and we only need the low 16 bits of the result
1313 Value *a[2], *b[2];
1314 bld.mkSplit(a, 2, attrib);
1315 bld.mkSplit(b, 2, vstride);
1316 Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
1317 base);
1318
1319 // move address from GPR into an address register
1320 addr = bld.getSSA(2, FILE_ADDRESS);
1321 bld.mkMov(addr, sum);
1322 }
1323
1324 i->setIndirect(0, 1, NULL);
1325 i->setIndirect(0, 0, addr);
1326 }
1327
1328 return true;
1329 }
1330
1331 bool
handlePFETCH(Instruction * i)1332 NV50LoweringPreSSA::handlePFETCH(Instruction *i)
1333 {
1334 assert(prog->getType() == Program::TYPE_GEOMETRY);
1335
1336 // NOTE: cannot use getImmediate here, not in SSA form yet, move to
1337 // later phase if that assertion ever triggers:
1338
1339 ImmediateValue *imm = i->getSrc(0)->asImm();
1340 assert(imm);
1341
1342 assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
1343
1344 if (i->srcExists(1)) {
1345 // indirect addressing of vertex in primitive space
1346
1347 LValue *val = bld.getScratch();
1348 Value *ptr = bld.getSSA(2, FILE_ADDRESS);
1349 bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
1350 bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
1351
1352 // NOTE: PFETCH directly to an $aX only works with direct addressing
1353 i->op = OP_SHL;
1354 i->setSrc(0, val);
1355 i->setSrc(1, bld.mkImm(0));
1356 }
1357
1358 return true;
1359 }
1360
1361 // Set flags according to predicate and make the instruction read $cX.
1362 void
checkPredicate(Instruction * insn)1363 NV50LoweringPreSSA::checkPredicate(Instruction *insn)
1364 {
1365 Value *pred = insn->getPredicate();
1366 Value *cdst;
1367
1368 // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
1369 if (!pred ||
1370 pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
1371 return;
1372
1373 cdst = bld.getSSA(1, FILE_FLAGS);
1374
1375 bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
1376
1377 insn->setPredicate(insn->cc, cdst);
1378 }
1379
1380 //
1381 // - add quadop dance for texturing
1382 // - put FP outputs in GPRs
1383 // - convert instruction sequences
1384 //
1385 bool
visit(Instruction * i)1386 NV50LoweringPreSSA::visit(Instruction *i)
1387 {
1388 bld.setPosition(i, false);
1389
1390 if (i->cc != CC_ALWAYS)
1391 checkPredicate(i);
1392
1393 switch (i->op) {
1394 case OP_TEX:
1395 case OP_TXF:
1396 case OP_TXG:
1397 return handleTEX(i->asTex());
1398 case OP_TXB:
1399 return handleTXB(i->asTex());
1400 case OP_TXL:
1401 return handleTXL(i->asTex());
1402 case OP_TXD:
1403 return handleTXD(i->asTex());
1404 case OP_TXLQ:
1405 return handleTXLQ(i->asTex());
1406 case OP_TXQ:
1407 return handleTXQ(i->asTex());
1408 case OP_EX2:
1409 bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1410 i->setSrc(0, i->getDef(0));
1411 break;
1412 case OP_SET:
1413 return handleSET(i);
1414 case OP_SLCT:
1415 return handleSLCT(i->asCmp());
1416 case OP_SELP:
1417 return handleSELP(i);
1418 case OP_POW:
1419 return handlePOW(i);
1420 case OP_DIV:
1421 return handleDIV(i);
1422 case OP_SQRT:
1423 return handleSQRT(i);
1424 case OP_EXPORT:
1425 return handleEXPORT(i);
1426 case OP_LOAD:
1427 return handleLOAD(i);
1428 case OP_RDSV:
1429 return handleRDSV(i);
1430 case OP_WRSV:
1431 return handleWRSV(i);
1432 case OP_CALL:
1433 return handleCALL(i);
1434 case OP_PRECONT:
1435 return handlePRECONT(i);
1436 case OP_CONT:
1437 return handleCONT(i);
1438 case OP_PFETCH:
1439 return handlePFETCH(i);
1440 default:
1441 break;
1442 }
1443 return true;
1444 }
1445
1446 bool
runLegalizePass(Program * prog,CGStage stage) const1447 TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
1448 {
1449 bool ret = false;
1450
1451 if (stage == CG_STAGE_PRE_SSA) {
1452 NV50LoweringPreSSA pass(prog);
1453 ret = pass.run(prog, false, true);
1454 } else
1455 if (stage == CG_STAGE_SSA) {
1456 if (!prog->targetPriv)
1457 prog->targetPriv = new std::list<Instruction *>();
1458 NV50LegalizeSSA pass(prog);
1459 ret = pass.run(prog, false, true);
1460 } else
1461 if (stage == CG_STAGE_POST_RA) {
1462 NV50LegalizePostRA pass;
1463 ret = pass.run(prog, false, true);
1464 if (prog->targetPriv)
1465 delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
1466 }
1467 return ret;
1468 }
1469
1470 } // namespace nv50_ir
1471