1 /*
2  * Copyright 2011 Christoph Bumiller
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19  * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20  * SOFTWARE.
21  */
22 
23 #include "nv50_ir_target_nvc0.h"
24 
25 namespace nv50_ir {
26 
27 // Argh, all these assertions ...
28 
29 class CodeEmitterNVC0 : public CodeEmitter
30 {
31 public:
32    CodeEmitterNVC0(const TargetNVC0 *);
33 
34    virtual bool emitInstruction(Instruction *);
35    virtual uint32_t getMinEncodingSize(const Instruction *) const;
36    virtual void prepareEmission(Function *);
37 
setProgramType(Program::Type pType)38    inline void setProgramType(Program::Type pType) { progType = pType; }
39 
40 private:
41    const TargetNVC0 *targ;
42 
43    Program::Type progType;
44 
45    const bool writeIssueDelays;
46 
47 private:
48    void emitForm_A(const Instruction *, uint64_t);
49    void emitForm_B(const Instruction *, uint64_t);
50    void emitForm_S(const Instruction *, uint32_t, bool pred);
51 
52    void emitPredicate(const Instruction *);
53 
54    void setAddress16(const ValueRef&);
55    void setImmediate(const Instruction *, const int s); // needs op already set
56    void setImmediateS8(const ValueRef&);
57 
58    void emitCondCode(CondCode cc, int pos);
59    void emitInterpMode(const Instruction *);
60    void emitLoadStoreType(DataType ty);
61    void emitCachingMode(CacheMode c);
62 
63    void emitShortSrc2(const ValueRef&);
64 
65    inline uint8_t getSRegEncoding(const ValueRef&);
66 
67    void roundMode_A(const Instruction *);
68    void roundMode_C(const Instruction *);
69    void roundMode_CS(const Instruction *);
70 
71    void emitNegAbs12(const Instruction *);
72 
73    void emitNOP(const Instruction *);
74 
75    void emitLOAD(const Instruction *);
76    void emitSTORE(const Instruction *);
77    void emitMOV(const Instruction *);
78 
79    void emitINTERP(const Instruction *);
80    void emitPFETCH(const Instruction *);
81    void emitVFETCH(const Instruction *);
82    void emitEXPORT(const Instruction *);
83    void emitOUT(const Instruction *);
84 
85    void emitUADD(const Instruction *);
86    void emitFADD(const Instruction *);
87    void emitUMUL(const Instruction *);
88    void emitFMUL(const Instruction *);
89    void emitIMAD(const Instruction *);
90    void emitISAD(const Instruction *);
91    void emitFMAD(const Instruction *);
92 
93    void emitNOT(Instruction *);
94    void emitLogicOp(const Instruction *, uint8_t subOp);
95    void emitPOPC(const Instruction *);
96    void emitINSBF(const Instruction *);
97    void emitShift(const Instruction *);
98 
99    void emitSFnOp(const Instruction *, uint8_t subOp);
100 
101    void emitCVT(Instruction *);
102    void emitMINMAX(const Instruction *);
103    void emitPreOp(const Instruction *);
104 
105    void emitSET(const CmpInstruction *);
106    void emitSLCT(const CmpInstruction *);
107    void emitSELP(const Instruction *);
108 
109    void emitTEXBAR(const Instruction *);
110    void emitTEX(const TexInstruction *);
111    void emitTEXCSAA(const TexInstruction *);
112    void emitTXQ(const TexInstruction *);
113    void emitPIXLD(const TexInstruction *);
114 
115    void emitQUADOP(const Instruction *, uint8_t qOp, uint8_t laneMask);
116 
117    void emitFlow(const Instruction *);
118 
119    inline void defId(const ValueDef&, const int pos);
120    inline void srcId(const ValueRef&, const int pos);
121    inline void srcId(const ValueRef *, const int pos);
122    inline void srcId(const Instruction *, int s, const int pos);
123 
124    inline void srcAddr32(const ValueRef&, const int pos); // address / 4
125 
126    inline bool isLIMM(const ValueRef&, DataType ty);
127 };
128 
// for better visibility
// HEX64(h, l) token-pastes two groups of hex digits into one 64-bit literal,
// 0x<h><l>ULL, so an opcode can be written as its high and low 32-bit halves.
#define HEX64(h, l) 0x##h##l##ULL

// SDATA and DDATA expand to the same expression, (a).rep()->reg.data. In this
// file SDATA is applied to source references and DDATA to definitions.
#define SDATA(a) ((a).rep()->reg.data)
#define DDATA(a) ((a).rep()->reg.data)
134 
srcId(const ValueRef & src,const int pos)135 void CodeEmitterNVC0::srcId(const ValueRef& src, const int pos)
136 {
137    code[pos / 32] |= (src.get() ? SDATA(src).id : 63) << (pos % 32);
138 }
139 
srcId(const ValueRef * src,const int pos)140 void CodeEmitterNVC0::srcId(const ValueRef *src, const int pos)
141 {
142    code[pos / 32] |= (src ? SDATA(*src).id : 63) << (pos % 32);
143 }
144 
srcId(const Instruction * insn,int s,int pos)145 void CodeEmitterNVC0::srcId(const Instruction *insn, int s, int pos)
146 {
147    int r = insn->srcExists(s) ? SDATA(insn->src(s)).id : 63;
148    code[pos / 32] |= r << (pos % 32);
149 }
150 
srcAddr32(const ValueRef & src,const int pos)151 void CodeEmitterNVC0::srcAddr32(const ValueRef& src, const int pos)
152 {
153    code[pos / 32] |= (SDATA(src).offset >> 2) << (pos % 32);
154 }
155 
defId(const ValueDef & def,const int pos)156 void CodeEmitterNVC0::defId(const ValueDef& def, const int pos)
157 {
158    code[pos / 32] |= (def.get() ? DDATA(def).id : 63) << (pos % 32);
159 }
160 
isLIMM(const ValueRef & ref,DataType ty)161 bool CodeEmitterNVC0::isLIMM(const ValueRef& ref, DataType ty)
162 {
163    const ImmediateValue *imm = ref.get()->asImm();
164 
165    return imm && (imm->reg.data.u32 & ((ty == TYPE_F32) ? 0xfff : 0xfff00000));
166 }
167 
168 void
roundMode_A(const Instruction * insn)169 CodeEmitterNVC0::roundMode_A(const Instruction *insn)
170 {
171    switch (insn->rnd) {
172    case ROUND_M: code[1] |= 1 << 23; break;
173    case ROUND_P: code[1] |= 2 << 23; break;
174    case ROUND_Z: code[1] |= 3 << 23; break;
175    default:
176       assert(insn->rnd == ROUND_N);
177       break;
178    }
179 }
180 
181 void
emitNegAbs12(const Instruction * i)182 CodeEmitterNVC0::emitNegAbs12(const Instruction *i)
183 {
184    if (i->src(1).mod.abs()) code[0] |= 1 << 6;
185    if (i->src(0).mod.abs()) code[0] |= 1 << 7;
186    if (i->src(1).mod.neg()) code[0] |= 1 << 8;
187    if (i->src(0).mod.neg()) code[0] |= 1 << 9;
188 }
189 
emitCondCode(CondCode cc,int pos)190 void CodeEmitterNVC0::emitCondCode(CondCode cc, int pos)
191 {
192    uint8_t val;
193 
194    switch (cc) {
195    case CC_LT:  val = 0x1; break;
196    case CC_LTU: val = 0x9; break;
197    case CC_EQ:  val = 0x2; break;
198    case CC_EQU: val = 0xa; break;
199    case CC_LE:  val = 0x3; break;
200    case CC_LEU: val = 0xb; break;
201    case CC_GT:  val = 0x4; break;
202    case CC_GTU: val = 0xc; break;
203    case CC_NE:  val = 0x5; break;
204    case CC_NEU: val = 0xd; break;
205    case CC_GE:  val = 0x6; break;
206    case CC_GEU: val = 0xe; break;
207    case CC_TR:  val = 0xf; break;
208    case CC_FL:  val = 0x0; break;
209 
210    case CC_A:  val = 0x14; break;
211    case CC_NA: val = 0x13; break;
212    case CC_S:  val = 0x15; break;
213    case CC_NS: val = 0x12; break;
214    case CC_C:  val = 0x16; break;
215    case CC_NC: val = 0x11; break;
216    case CC_O:  val = 0x17; break;
217    case CC_NO: val = 0x10; break;
218 
219    default:
220       val = 0;
221       assert(!"invalid condition code");
222       break;
223    }
224    code[pos / 32] |= val << (pos % 32);
225 }
226 
227 void
emitPredicate(const Instruction * i)228 CodeEmitterNVC0::emitPredicate(const Instruction *i)
229 {
230    if (i->predSrc >= 0) {
231       assert(i->getPredicate()->reg.file == FILE_PREDICATE);
232       srcId(i->src(i->predSrc), 10);
233       if (i->cc == CC_NOT_P)
234          code[0] |= 0x2000; // negate
235    } else {
236       code[0] |= 0x1c00;
237    }
238 }
239 
240 void
setAddress16(const ValueRef & src)241 CodeEmitterNVC0::setAddress16(const ValueRef& src)
242 {
243    Symbol *sym = src.get()->asSym();
244 
245    assert(sym);
246 
247    code[0] |= (sym->reg.data.offset & 0x003f) << 26;
248    code[1] |= (sym->reg.data.offset & 0xffc0) >> 6;
249 }
250 
251 void
setImmediate(const Instruction * i,const int s)252 CodeEmitterNVC0::setImmediate(const Instruction *i, const int s)
253 {
254    const ImmediateValue *imm = i->src(s).get()->asImm();
255    uint32_t u32;
256 
257    assert(imm);
258    u32 = imm->reg.data.u32;
259 
260    if ((code[0] & 0xf) == 0x2) {
261       // LIMM
262       code[0] |= (u32 & 0x3f) << 26;
263       code[1] |= u32 >> 6;
264    } else
265    if ((code[0] & 0xf) == 0x3 || (code[0] & 0xf) == 4) {
266       // integer immediate
267       assert((u32 & 0xfff00000) == 0 || (u32 & 0xfff00000) == 0xfff00000);
268       assert(!(code[1] & 0xc000));
269       u32 &= 0xfffff;
270       code[0] |= (u32 & 0x3f) << 26;
271       code[1] |= 0xc000 | (u32 >> 6);
272    } else {
273       // float immediate
274       assert(!(u32 & 0x00000fff));
275       assert(!(code[1] & 0xc000));
276       code[0] |= ((u32 >> 12) & 0x3f) << 26;
277       code[1] |= 0xc000 | (u32 >> 18);
278    }
279 }
280 
setImmediateS8(const ValueRef & ref)281 void CodeEmitterNVC0::setImmediateS8(const ValueRef &ref)
282 {
283    const ImmediateValue *imm = ref.get()->asImm();
284 
285    int8_t s8 = static_cast<int8_t>(imm->reg.data.s32);
286 
287    assert(s8 == imm->reg.data.s32);
288 
289    code[0] |= (s8 & 0x3f) << 26;
290    code[0] |= (s8 >> 6) << 8;
291 }
292 
293 void
emitForm_A(const Instruction * i,uint64_t opc)294 CodeEmitterNVC0::emitForm_A(const Instruction *i, uint64_t opc)
295 {
296    code[0] = opc;
297    code[1] = opc >> 32;
298 
299    emitPredicate(i);
300 
301    defId(i->def(0), 14);
302 
303    int s1 = 26;
304    if (i->srcExists(2) && i->getSrc(2)->reg.file == FILE_MEMORY_CONST)
305       s1 = 49;
306 
307    for (int s = 0; s < 3 && i->srcExists(s); ++s) {
308       switch (i->getSrc(s)->reg.file) {
309       case FILE_MEMORY_CONST:
310          assert(!(code[1] & 0xc000));
311          code[1] |= (s == 2) ? 0x8000 : 0x4000;
312          code[1] |= i->getSrc(s)->reg.fileIndex << 10;
313          setAddress16(i->src(s));
314          break;
315       case FILE_IMMEDIATE:
316          assert(s == 1 ||
317                 i->op == OP_MOV || i->op == OP_PRESIN || i->op == OP_PREEX2);
318          assert(!(code[1] & 0xc000));
319          setImmediate(i, s);
320          break;
321       case FILE_GPR:
322          if ((s == 2) && ((code[0] & 0x7) == 2)) // LIMM: 3rd src == dst
323             break;
324          srcId(i->src(s), s ? ((s == 2) ? 49 : s1) : 20);
325          break;
326       default:
327          // ignore here, can be predicate or flags, but must not be address
328          break;
329       }
330    }
331 }
332 
333 void
emitForm_B(const Instruction * i,uint64_t opc)334 CodeEmitterNVC0::emitForm_B(const Instruction *i, uint64_t opc)
335 {
336    code[0] = opc;
337    code[1] = opc >> 32;
338 
339    emitPredicate(i);
340 
341    defId(i->def(0), 14);
342 
343    switch (i->src(0).getFile()) {
344    case FILE_MEMORY_CONST:
345       assert(!(code[1] & 0xc000));
346       code[1] |= 0x4000 | (i->src(0).get()->reg.fileIndex << 10);
347       setAddress16(i->src(0));
348       break;
349    case FILE_IMMEDIATE:
350       assert(!(code[1] & 0xc000));
351       setImmediate(i, 0);
352       break;
353    case FILE_GPR:
354       srcId(i->src(0), 26);
355       break;
356    default:
357       // ignore here, can be predicate or flags, but must not be address
358       break;
359    }
360 }
361 
362 void
emitForm_S(const Instruction * i,uint32_t opc,bool pred)363 CodeEmitterNVC0::emitForm_S(const Instruction *i, uint32_t opc, bool pred)
364 {
365    code[0] = opc;
366 
367    int ss2a = 0;
368    if (opc == 0x0d || opc == 0x0e)
369       ss2a = 2;
370 
371    defId(i->def(0), 14);
372    srcId(i->src(0), 20);
373 
374    assert(pred || (i->predSrc < 0));
375    if (pred)
376       emitPredicate(i);
377 
378    for (int s = 1; s < 3 && i->srcExists(s); ++s) {
379       if (i->src(s).get()->reg.file == FILE_MEMORY_CONST) {
380          assert(!(code[0] & (0x300 >> ss2a)));
381          switch (i->src(s).get()->reg.fileIndex) {
382          case 0:  code[0] |= 0x100 >> ss2a; break;
383          case 1:  code[0] |= 0x200 >> ss2a; break;
384          case 16: code[0] |= 0x300 >> ss2a; break;
385          default:
386             ERROR("invalid c[] space for short form\n");
387             break;
388          }
389          if (s == 1)
390             code[0] |= i->getSrc(s)->reg.data.offset << 24;
391          else
392             code[0] |= i->getSrc(s)->reg.data.offset << 6;
393       } else
394       if (i->src(s).getFile() == FILE_IMMEDIATE) {
395          assert(s == 1);
396          setImmediateS8(i->src(s));
397       } else
398       if (i->src(s).getFile() == FILE_GPR) {
399          srcId(i->src(s), (s == 1) ? 26 : 8);
400       }
401    }
402 }
403 
404 void
emitShortSrc2(const ValueRef & src)405 CodeEmitterNVC0::emitShortSrc2(const ValueRef &src)
406 {
407    if (src.getFile() == FILE_MEMORY_CONST) {
408       switch (src.get()->reg.fileIndex) {
409       case 0:  code[0] |= 0x100; break;
410       case 1:  code[0] |= 0x200; break;
411       case 16: code[0] |= 0x300; break;
412       default:
413          assert(!"unsupported file index for short op");
414          break;
415       }
416       srcAddr32(src, 20);
417    } else {
418       srcId(src, 20);
419       assert(src.getFile() == FILE_GPR);
420    }
421 }
422 
423 void
emitNOP(const Instruction * i)424 CodeEmitterNVC0::emitNOP(const Instruction *i)
425 {
426    code[0] = 0x000001e4;
427    code[1] = 0x40000000;
428    emitPredicate(i);
429 }
430 
431 void
emitFMAD(const Instruction * i)432 CodeEmitterNVC0::emitFMAD(const Instruction *i)
433 {
434    bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg();
435 
436    if (i->encSize == 8) {
437       if (isLIMM(i->src(1), TYPE_F32)) {
438          emitForm_A(i, HEX64(20000000, 00000002));
439       } else {
440          emitForm_A(i, HEX64(30000000, 00000000));
441 
442          if (i->src(2).mod.neg())
443             code[0] |= 1 << 8;
444       }
445       roundMode_A(i);
446 
447       if (neg1)
448          code[0] |= 1 << 9;
449 
450       if (i->saturate)
451          code[0] |= 1 << 5;
452       if (i->ftz)
453          code[0] |= 1 << 6;
454    } else {
455       assert(!i->saturate && !i->src(2).mod.neg());
456       emitForm_S(i, (i->src(2).getFile() == FILE_MEMORY_CONST) ? 0x2e : 0x0e,
457                  false);
458       if (neg1)
459          code[0] |= 1 << 4;
460    }
461 }
462 
463 void
emitFMUL(const Instruction * i)464 CodeEmitterNVC0::emitFMUL(const Instruction *i)
465 {
466    bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
467 
468    assert(i->postFactor >= -3 && i->postFactor <= 3);
469 
470    if (i->encSize == 8) {
471       if (isLIMM(i->src(1), TYPE_F32)) {
472          assert(i->postFactor == 0); // constant folded, hopefully
473          emitForm_A(i, HEX64(30000000, 00000002));
474       } else {
475          emitForm_A(i, HEX64(58000000, 00000000));
476          roundMode_A(i);
477          code[1] |= ((i->postFactor > 0) ?
478                      (7 - i->postFactor) : (0 - i->postFactor)) << 17;
479       }
480       if (neg)
481          code[1] ^= 1 << 25; // aliases with LIMM sign bit
482 
483       if (i->saturate)
484          code[0] |= 1 << 5;
485 
486       if (i->dnz)
487          code[0] |= 1 << 7;
488       else
489       if (i->ftz)
490          code[0] |= 1 << 6;
491    } else {
492       assert(!neg && !i->saturate && !i->ftz && !i->postFactor);
493       emitForm_S(i, 0xa8, true);
494    }
495 }
496 
497 void
emitUMUL(const Instruction * i)498 CodeEmitterNVC0::emitUMUL(const Instruction *i)
499 {
500    if (i->encSize == 8) {
501       if (i->src(1).getFile() == FILE_IMMEDIATE) {
502          emitForm_A(i, HEX64(10000000, 00000002));
503       } else {
504          emitForm_A(i, HEX64(50000000, 00000003));
505       }
506       if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
507          code[0] |= 1 << 6;
508       if (i->sType == TYPE_S32)
509          code[0] |= 1 << 5;
510       if (i->dType == TYPE_S32)
511          code[0] |= 1 << 7;
512    } else {
513       emitForm_S(i, i->src(1).getFile() == FILE_IMMEDIATE ? 0xaa : 0x2a, true);
514 
515       if (i->sType == TYPE_S32)
516          code[0] |= 1 << 6;
517    }
518 }
519 
520 void
emitFADD(const Instruction * i)521 CodeEmitterNVC0::emitFADD(const Instruction *i)
522 {
523    if (i->encSize == 8) {
524       if (isLIMM(i->src(1), TYPE_F32)) {
525          assert(!i->saturate);
526          emitForm_A(i, HEX64(28000000, 00000002));
527 
528          code[0] |= i->src(0).mod.abs() << 7;
529          code[0] |= i->src(0).mod.neg() << 9;
530 
531          if (i->src(1).mod.abs())
532             code[1] &= 0xfdffffff;
533          if ((i->op == OP_SUB) != static_cast<bool>(i->src(1).mod.neg()))
534             code[1] ^= 0x02000000;
535       } else {
536          emitForm_A(i, HEX64(50000000, 00000000));
537 
538          roundMode_A(i);
539          if (i->saturate)
540             code[1] |= 1 << 17;
541 
542          emitNegAbs12(i);
543          if (i->op == OP_SUB) code[0] ^= 1 << 8;
544       }
545       if (i->ftz)
546          code[0] |= 1 << 5;
547    } else {
548       assert(!i->saturate && i->op != OP_SUB &&
549              !i->src(0).mod.abs() &&
550              !i->src(1).mod.neg() && !i->src(1).mod.abs());
551 
552       emitForm_S(i, 0x49, true);
553 
554       if (i->src(0).mod.neg())
555          code[0] |= 1 << 7;
556    }
557 }
558 
559 void
emitUADD(const Instruction * i)560 CodeEmitterNVC0::emitUADD(const Instruction *i)
561 {
562    uint32_t addOp = 0;
563 
564    assert(!i->src(0).mod.abs() && !i->src(1).mod.abs());
565    assert(!i->src(0).mod.neg() || !i->src(1).mod.neg());
566 
567    if (i->src(0).mod.neg())
568       addOp |= 0x200;
569    if (i->src(1).mod.neg())
570       addOp |= 0x100;
571    if (i->op == OP_SUB) {
572       addOp ^= 0x100;
573       assert(addOp != 0x300); // would be add-plus-one
574    }
575 
576    if (i->encSize == 8) {
577       if (isLIMM(i->src(1), TYPE_U32)) {
578          emitForm_A(i, HEX64(08000000, 00000002));
579          if (i->defExists(1))
580             code[1] |= 1 << 26; // write carry
581       } else {
582          emitForm_A(i, HEX64(48000000, 00000003));
583          if (i->defExists(1))
584             code[1] |= 1 << 16; // write carry
585       }
586       code[0] |= addOp;
587 
588       if (i->saturate)
589          code[0] |= 1 << 5;
590       if (i->flagsSrc >= 0) // add carry
591          code[0] |= 1 << 6;
592    } else {
593       assert(!(addOp & 0x100));
594       emitForm_S(i, (addOp >> 3) |
595                  ((i->src(1).getFile() == FILE_IMMEDIATE) ? 0xac : 0x2c), true);
596    }
597 }
598 
599 // TODO: shl-add
600 void
emitIMAD(const Instruction * i)601 CodeEmitterNVC0::emitIMAD(const Instruction *i)
602 {
603    assert(i->encSize == 8);
604    emitForm_A(i, HEX64(20000000, 00000003));
605 
606    if (isSignedType(i->dType))
607       code[0] |= 1 << 7;
608    if (isSignedType(i->sType))
609       code[0] |= 1 << 5;
610 
611    code[1] |= i->saturate << 24;
612 
613    if (i->flagsDef >= 0) code[1] |= 1 << 16;
614    if (i->flagsSrc >= 0) code[1] |= 1 << 23;
615 
616    if (i->src(2).mod.neg()) code[0] |= 0x10;
617    if (i->src(1).mod.neg() ^
618        i->src(0).mod.neg()) code[0] |= 0x20;
619 
620    if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
621       code[0] |= 1 << 6;
622 }
623 
624 void
emitISAD(const Instruction * i)625 CodeEmitterNVC0::emitISAD(const Instruction *i)
626 {
627    assert(i->dType == TYPE_S32 || i->dType == TYPE_U32);
628    assert(i->encSize == 8);
629 
630    emitForm_A(i, HEX64(38000000, 00000003));
631 
632    if (i->dType == TYPE_S32)
633       code[0] |= 1 << 5;
634 }
635 
636 void
emitNOT(Instruction * i)637 CodeEmitterNVC0::emitNOT(Instruction *i)
638 {
639    assert(i->encSize == 8);
640    i->setSrc(1, i->src(0));
641    emitForm_A(i, HEX64(68000000, 000001c3));
642 }
643 
644 void
emitLogicOp(const Instruction * i,uint8_t subOp)645 CodeEmitterNVC0::emitLogicOp(const Instruction *i, uint8_t subOp)
646 {
647    if (i->encSize == 8) {
648       if (isLIMM(i->src(1), TYPE_U32)) {
649          emitForm_A(i, HEX64(38000000, 00000002));
650 
651          if (i->srcExists(2))
652             code[1] |= 1 << 26;
653       } else {
654          emitForm_A(i, HEX64(68000000, 00000003));
655 
656          if (i->srcExists(2))
657             code[1] |= 1 << 16;
658       }
659       code[0] |= subOp << 6;
660 
661       if (i->srcExists(2)) // carry
662          code[0] |= 1 << 5;
663 
664       if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 9;
665       if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 8;
666    } else {
667       emitForm_S(i, (subOp << 5) |
668                  ((i->src(1).getFile() == FILE_IMMEDIATE) ? 0x1d : 0x8d), true);
669    }
670 }
671 
672 void
emitPOPC(const Instruction * i)673 CodeEmitterNVC0::emitPOPC(const Instruction *i)
674 {
675    emitForm_A(i, HEX64(54000000, 00000004));
676 
677    if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 9;
678    if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 8;
679 }
680 
681 void
emitINSBF(const Instruction * i)682 CodeEmitterNVC0::emitINSBF(const Instruction *i)
683 {
684    emitForm_A(i, HEX64(28000000, 30000000));
685 }
686 
687 void
emitShift(const Instruction * i)688 CodeEmitterNVC0::emitShift(const Instruction *i)
689 {
690    if (i->op == OP_SHR) {
691       emitForm_A(i, HEX64(58000000, 00000003)
692                  | (isSignedType(i->dType) ? 0x20 : 0x00));
693    } else {
694       emitForm_A(i, HEX64(60000000, 00000003));
695    }
696 
697    if (i->subOp == NV50_IR_SUBOP_SHIFT_WRAP)
698       code[0] |= 1 << 9;
699 }
700 
701 void
emitPreOp(const Instruction * i)702 CodeEmitterNVC0::emitPreOp(const Instruction *i)
703 {
704    if (i->encSize == 8) {
705       emitForm_B(i, HEX64(60000000, 00000000));
706 
707       if (i->op == OP_PREEX2)
708          code[0] |= 0x20;
709 
710       if (i->src(0).mod.abs()) code[0] |= 1 << 6;
711       if (i->src(0).mod.neg()) code[0] |= 1 << 8;
712    } else {
713       emitForm_S(i, i->op == OP_PREEX2 ? 0x74000008 : 0x70000008, true);
714    }
715 }
716 
717 void
emitSFnOp(const Instruction * i,uint8_t subOp)718 CodeEmitterNVC0::emitSFnOp(const Instruction *i, uint8_t subOp)
719 {
720    if (i->encSize == 8) {
721       code[0] = 0x00000000 | (subOp << 26);
722       code[1] = 0xc8000000;
723 
724       emitPredicate(i);
725 
726       defId(i->def(0), 14);
727       srcId(i->src(0), 20);
728 
729       assert(i->src(0).getFile() == FILE_GPR);
730 
731       if (i->saturate) code[0] |= 1 << 5;
732 
733       if (i->src(0).mod.abs()) code[0] |= 1 << 7;
734       if (i->src(0).mod.neg()) code[0] |= 1 << 9;
735    } else {
736       emitForm_S(i, 0x80000008 | (subOp << 26), true);
737 
738       assert(!i->src(0).mod.neg());
739       if (i->src(0).mod.abs()) code[0] |= 1 << 30;
740    }
741 }
742 
743 void
emitMINMAX(const Instruction * i)744 CodeEmitterNVC0::emitMINMAX(const Instruction *i)
745 {
746    uint64_t op;
747 
748    assert(i->encSize == 8);
749 
750    op = (i->op == OP_MIN) ? 0x080e000000000000ULL : 0x081e000000000000ULL;
751 
752    if (i->ftz)
753       op |= 1 << 5;
754    else
755    if (!isFloatType(i->dType))
756       op |= isSignedType(i->dType) ? 0x23 : 0x03;
757 
758    emitForm_A(i, op);
759    emitNegAbs12(i);
760 }
761 
762 void
roundMode_C(const Instruction * i)763 CodeEmitterNVC0::roundMode_C(const Instruction *i)
764 {
765    switch (i->rnd) {
766    case ROUND_M:  code[1] |= 1 << 17; break;
767    case ROUND_P:  code[1] |= 2 << 17; break;
768    case ROUND_Z:  code[1] |= 3 << 17; break;
769    case ROUND_NI: code[0] |= 1 << 7; break;
770    case ROUND_MI: code[0] |= 1 << 7; code[1] |= 1 << 17; break;
771    case ROUND_PI: code[0] |= 1 << 7; code[1] |= 2 << 17; break;
772    case ROUND_ZI: code[0] |= 1 << 7; code[1] |= 3 << 17; break;
773    case ROUND_N: break;
774    default:
775       assert(!"invalid round mode");
776       break;
777    }
778 }
779 
780 void
roundMode_CS(const Instruction * i)781 CodeEmitterNVC0::roundMode_CS(const Instruction *i)
782 {
783    switch (i->rnd) {
784    case ROUND_M:
785    case ROUND_MI: code[0] |= 1 << 16; break;
786    case ROUND_P:
787    case ROUND_PI: code[0] |= 2 << 16; break;
788    case ROUND_Z:
789    case ROUND_ZI: code[0] |= 3 << 16; break;
790    default:
791       break;
792    }
793 }
794 
795 void
emitCVT(Instruction * i)796 CodeEmitterNVC0::emitCVT(Instruction *i)
797 {
798    const bool f2f = isFloatType(i->dType) && isFloatType(i->sType);
799 
800    switch (i->op) {
801    case OP_CEIL:  i->rnd = f2f ? ROUND_PI : ROUND_P; break;
802    case OP_FLOOR: i->rnd = f2f ? ROUND_MI : ROUND_M; break;
803    case OP_TRUNC: i->rnd = f2f ? ROUND_ZI : ROUND_Z; break;
804    default:
805       break;
806    }
807 
808    const bool sat = (i->op == OP_SAT) || i->saturate;
809    const bool abs = (i->op == OP_ABS) || i->src(0).mod.abs();
810    const bool neg = (i->op == OP_NEG) || i->src(0).mod.neg();
811 
812    if (i->encSize == 8) {
813       emitForm_B(i, HEX64(10000000, 00000004));
814 
815       roundMode_C(i);
816 
817       // cvt u16 f32 sets high bits to 0, so we don't have to use Value::Size()
818       code[0] |= util_logbase2(typeSizeof(i->dType)) << 20;
819       code[0] |= util_logbase2(typeSizeof(i->sType)) << 23;
820 
821       if (sat)
822          code[0] |= 0x20;
823       if (abs)
824          code[0] |= 1 << 6;
825       if (neg && i->op != OP_ABS)
826          code[0] |= 1 << 8;
827 
828       if (i->ftz)
829          code[1] |= 1 << 23;
830 
831       if (isSignedIntType(i->dType))
832          code[0] |= 0x080;
833       if (isSignedIntType(i->sType))
834          code[0] |= 0x200;
835 
836       if (isFloatType(i->dType)) {
837          if (!isFloatType(i->sType))
838             code[1] |= 0x08000000;
839       } else {
840          if (isFloatType(i->sType))
841             code[1] |= 0x04000000;
842          else
843             code[1] |= 0x0c000000;
844       }
845    } else {
846       if (i->op == OP_CEIL || i->op == OP_FLOOR || i->op == OP_TRUNC) {
847          code[0] = 0x298;
848       } else
849       if (isFloatType(i->dType)) {
850          if (isFloatType(i->sType))
851             code[0] = 0x098;
852          else
853             code[0] = 0x088 | (isSignedType(i->sType) ? (1 << 8) : 0);
854       } else {
855          assert(isFloatType(i->sType));
856 
857          code[0] = 0x288 | (isSignedType(i->sType) ? (1 << 8) : 0);
858       }
859 
860       if (neg) code[0] |= 1 << 16;
861       if (sat) code[0] |= 1 << 18;
862       if (abs) code[0] |= 1 << 19;
863 
864       roundMode_CS(i);
865    }
866 }
867 
868 void
emitSET(const CmpInstruction * i)869 CodeEmitterNVC0::emitSET(const CmpInstruction *i)
870 {
871    uint32_t hi;
872    uint32_t lo = 0;
873 
874    if (i->sType == TYPE_F64)
875       lo = 0x1;
876    else
877    if (!isFloatType(i->sType))
878       lo = 0x3;
879 
880    if (isFloatType(i->dType) || isSignedIntType(i->sType))
881       lo |= 0x20;
882 
883    switch (i->op) {
884    case OP_SET_AND: hi = 0x10000000; break;
885    case OP_SET_OR:  hi = 0x10200000; break;
886    case OP_SET_XOR: hi = 0x10400000; break;
887    default:
888       hi = 0x100e0000;
889       break;
890    }
891    emitForm_A(i, (static_cast<uint64_t>(hi) << 32) | lo);
892 
893    if (i->op != OP_SET)
894       srcId(i->src(2), 32 + 17);
895 
896    if (i->def(0).getFile() == FILE_PREDICATE) {
897       if (i->sType == TYPE_F32)
898          code[1] += 0x10000000;
899       else
900          code[1] += 0x08000000;
901 
902       code[0] &= ~0xfc000;
903       defId(i->def(0), 17);
904       if (i->defExists(1))
905          defId(i->def(1), 14);
906       else
907          code[0] |= 0x1c000;
908    }
909 
910    if (i->ftz)
911       code[1] |= 1 << 27;
912 
913    emitCondCode(i->setCond, 32 + 23);
914    emitNegAbs12(i);
915 }
916 
917 void
emitSLCT(const CmpInstruction * i)918 CodeEmitterNVC0::emitSLCT(const CmpInstruction *i)
919 {
920    uint64_t op;
921 
922    switch (i->dType) {
923    case TYPE_S32:
924       op = HEX64(30000000, 00000023);
925       break;
926    case TYPE_U32:
927       op = HEX64(30000000, 00000003);
928       break;
929    case TYPE_F32:
930       op = HEX64(38000000, 00000000);
931       break;
932    default:
933       assert(!"invalid type for SLCT");
934       op = 0;
935       break;
936    }
937    emitForm_A(i, op);
938 
939    CondCode cc = i->setCond;
940 
941    if (i->src(2).mod.neg())
942       cc = reverseCondCode(cc);
943 
944    emitCondCode(cc, 32 + 23);
945 
946    if (i->ftz)
947       code[0] |= 1 << 5;
948 }
949 
emitSELP(const Instruction * i)950 void CodeEmitterNVC0::emitSELP(const Instruction *i)
951 {
952    emitForm_A(i, HEX64(20000000, 00000004));
953 
954    if (i->cc == CC_NOT_P || i->src(2).mod & Modifier(NV50_IR_MOD_NOT))
955       code[1] |= 1 << 20;
956 }
957 
emitTEXBAR(const Instruction * i)958 void CodeEmitterNVC0::emitTEXBAR(const Instruction *i)
959 {
960    code[0] = 0x00000006 | (i->subOp << 26);
961    code[1] = 0xf0000000;
962    emitPredicate(i);
963    emitCondCode(i->flagsSrc >= 0 ? i->cc : CC_ALWAYS, 5);
964 }
965 
emitTEXCSAA(const TexInstruction * i)966 void CodeEmitterNVC0::emitTEXCSAA(const TexInstruction *i)
967 {
968    code[0] = 0x00000086;
969    code[1] = 0xd0000000;
970 
971    code[1] |= i->tex.r;
972    code[1] |= i->tex.s << 8;
973 
974    if (i->tex.liveOnly)
975       code[0] |= 1 << 9;
976 
977    defId(i->def(0), 14);
978    srcId(i->src(0), 20);
979 }
980 
981 static inline bool
isNextIndependentTex(const TexInstruction * i)982 isNextIndependentTex(const TexInstruction *i)
983 {
984    if (!i->next || !isTextureOp(i->next->op))
985       return false;
986    if (i->getDef(0)->interfers(i->next->getSrc(0)))
987       return false;
988    return !i->next->srcExists(1) || !i->getDef(0)->interfers(i->next->getSrc(1));
989 }
990 
991 void
emitTEX(const TexInstruction * i)992 CodeEmitterNVC0::emitTEX(const TexInstruction *i)
993 {
994    code[0] = 0x00000006;
995 
996    if (isNextIndependentTex(i))
997       code[0] |= 0x080; // t mode
998    else
999       code[0] |= 0x100; // p mode
1000 
1001    if (i->tex.liveOnly)
1002       code[0] |= 1 << 9;
1003 
1004    switch (i->op) {
1005    case OP_TEX: code[1] = 0x80000000; break;
1006    case OP_TXB: code[1] = 0x84000000; break;
1007    case OP_TXL: code[1] = 0x86000000; break;
1008    case OP_TXF: code[1] = 0x90000000; break;
1009    case OP_TXG: code[1] = 0xa0000000; break;
1010    case OP_TXD: code[1] = 0xe0000000; break;
1011    default:
1012       assert(!"invalid texture op");
1013       break;
1014    }
1015    if (i->op == OP_TXF) {
1016       if (!i->tex.levelZero)
1017          code[1] |= 0x02000000;
1018    } else
1019    if (i->tex.levelZero) {
1020       code[1] |= 0x02000000;
1021    }
1022 
1023    if (i->op != OP_TXD && i->tex.derivAll)
1024       code[1] |= 1 << 13;
1025 
1026    defId(i->def(0), 14);
1027    srcId(i->src(0), 20);
1028 
1029    emitPredicate(i);
1030 
1031    if (i->op == OP_TXG) code[0] |= i->tex.gatherComp << 5;
1032 
1033    code[1] |= i->tex.mask << 14;
1034 
1035    code[1] |= i->tex.r;
1036    code[1] |= i->tex.s << 8;
1037    if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0)
1038       code[1] |= 1 << 18; // in 1st source (with array index)
1039 
1040    // texture target:
1041    code[1] |= (i->tex.target.getDim() - 1) << 20;
1042    if (i->tex.target.isCube())
1043       code[1] += 2 << 20;
1044    if (i->tex.target.isArray())
1045       code[1] |= 1 << 19;
1046    if (i->tex.target.isShadow())
1047       code[1] |= 1 << 24;
1048 
1049    const int src1 = (i->predSrc == 1) ? 2 : 1; // if predSrc == 1, !srcExists(2)
1050 
1051    if (i->srcExists(src1) && i->src(src1).getFile() == FILE_IMMEDIATE) {
1052       // lzero
1053       if (i->op == OP_TXL)
1054          code[1] &= ~(1 << 26);
1055       else
1056       if (i->op == OP_TXF)
1057          code[1] &= ~(1 << 25);
1058    }
1059    if (i->tex.target == TEX_TARGET_2D_MS ||
1060        i->tex.target == TEX_TARGET_2D_MS_ARRAY)
1061       code[1] |= 1 << 23;
1062 
1063    if (i->tex.useOffsets) // in vecSrc0.w
1064       code[1] |= 1 << 22;
1065 
1066    srcId(i, src1, 26);
1067 }
1068 
1069 void
emitTXQ(const TexInstruction * i)1070 CodeEmitterNVC0::emitTXQ(const TexInstruction *i)
1071 {
1072    code[0] = 0x00000086;
1073    code[1] = 0xc0000000;
1074 
1075    switch (i->tex.query) {
1076    case TXQ_DIMS:            code[1] |= 0 << 22; break;
1077    case TXQ_TYPE:            code[1] |= 1 << 22; break;
1078    case TXQ_SAMPLE_POSITION: code[1] |= 2 << 22; break;
1079    case TXQ_FILTER:          code[1] |= 3 << 22; break;
1080    case TXQ_LOD:             code[1] |= 4 << 22; break;
1081    case TXQ_BORDER_COLOUR:   code[1] |= 5 << 22; break;
1082    default:
1083       assert(!"invalid texture query");
1084       break;
1085    }
1086 
1087    code[1] |= i->tex.mask << 14;
1088 
1089    code[1] |= i->tex.r;
1090    code[1] |= i->tex.s << 8;
1091    if (i->tex.sIndirectSrc >= 0 || i->tex.rIndirectSrc >= 0)
1092       code[1] |= 1 << 18;
1093 
1094    const int src1 = (i->predSrc == 1) ? 2 : 1; // if predSrc == 1, !srcExists(2)
1095 
1096    defId(i->def(0), 14);
1097    srcId(i->src(0), 20);
1098    srcId(i, src1, 26);
1099 
1100    emitPredicate(i);
1101 }
1102 
1103 void
emitQUADOP(const Instruction * i,uint8_t qOp,uint8_t laneMask)1104 CodeEmitterNVC0::emitQUADOP(const Instruction *i, uint8_t qOp, uint8_t laneMask)
1105 {
1106    code[0] = 0x00000000 | (laneMask << 6);
1107    code[1] = 0x48000000 | qOp;
1108 
1109    defId(i->def(0), 14);
1110    srcId(i->src(0), 20);
1111    srcId(i->srcExists(1) ? i->src(1) : i->src(0), 26);
1112 
1113    if (i->op == OP_QUADOP && progType != Program::TYPE_FRAGMENT)
1114       code[0] |= 1 << 9; // dall
1115 
1116    emitPredicate(i);
1117 }
1118 
1119 void
emitFlow(const Instruction * i)1120 CodeEmitterNVC0::emitFlow(const Instruction *i)
1121 {
1122    const FlowInstruction *f = i->asFlow();
1123 
1124    unsigned mask; // bit 0: predicate, bit 1: target
1125 
1126    code[0] = 0x00000007;
1127 
1128    switch (i->op) {
1129    case OP_BRA:
1130       code[1] = f->absolute ? 0x00000000 : 0x40000000;
1131       if (i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST)
1132          code[0] |= 0x4000;
1133       mask = 3;
1134       break;
1135    case OP_CALL:
1136       code[1] = f->absolute ? 0x10000000 : 0x50000000;
1137       if (i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST)
1138          code[0] |= 0x4000;
1139       mask = 2;
1140       break;
1141 
1142    case OP_EXIT:    code[1] = 0x80000000; mask = 1; break;
1143    case OP_RET:     code[1] = 0x90000000; mask = 1; break;
1144    case OP_DISCARD: code[1] = 0x98000000; mask = 1; break;
1145    case OP_BREAK:   code[1] = 0xa8000000; mask = 1; break;
1146    case OP_CONT:    code[1] = 0xb0000000; mask = 1; break;
1147 
1148    case OP_JOINAT:   code[1] = 0x60000000; mask = 2; break;
1149    case OP_PREBREAK: code[1] = 0x68000000; mask = 2; break;
1150    case OP_PRECONT:  code[1] = 0x70000000; mask = 2; break;
1151    case OP_PRERET:   code[1] = 0x78000000; mask = 2; break;
1152 
1153    case OP_QUADON:  code[1] = 0xc0000000; mask = 0; break;
1154    case OP_QUADPOP: code[1] = 0xc8000000; mask = 0; break;
1155    case OP_BRKPT:   code[1] = 0xd0000000; mask = 0; break;
1156    default:
1157       assert(!"invalid flow operation");
1158       return;
1159    }
1160 
1161    if (mask & 1) {
1162       emitPredicate(i);
1163       if (i->flagsSrc < 0)
1164          code[0] |= 0x1e0;
1165    }
1166 
1167    if (!f)
1168       return;
1169 
1170    if (f->allWarp)
1171       code[0] |= 1 << 15;
1172    if (f->limit)
1173       code[0] |= 1 << 16;
1174 
1175    if (f->op == OP_CALL) {
1176       if (f->builtin) {
1177          assert(f->absolute);
1178          uint32_t pcAbs = targ->getBuiltinOffset(f->target.builtin);
1179          addReloc(RelocEntry::TYPE_BUILTIN, 0, pcAbs, 0xfc000000, 26);
1180          addReloc(RelocEntry::TYPE_BUILTIN, 1, pcAbs, 0x03ffffff, -6);
1181       } else {
1182          assert(!f->absolute);
1183          int32_t pcRel = f->target.fn->binPos - (codeSize + 8);
1184          code[0] |= (pcRel & 0x3f) << 26;
1185          code[1] |= (pcRel >> 6) & 0x3ffff;
1186       }
1187    } else
1188    if (mask & 2) {
1189       int32_t pcRel = f->target.bb->binPos - (codeSize + 8);
1190       // currently we don't want absolute branches
1191       assert(!f->absolute);
1192       code[0] |= (pcRel & 0x3f) << 26;
1193       code[1] |= (pcRel >> 6) & 0x3ffff;
1194    }
1195 }
1196 
1197 void
emitPFETCH(const Instruction * i)1198 CodeEmitterNVC0::emitPFETCH(const Instruction *i)
1199 {
1200    uint32_t prim = i->src(0).get()->reg.data.u32;
1201 
1202    code[0] = 0x00000006 | ((prim & 0x3f) << 26);
1203    code[1] = 0x00000000 | (prim >> 6);
1204 
1205    emitPredicate(i);
1206 
1207    defId(i->def(0), 14);
1208    srcId(i->src(1), 20);
1209 }
1210 
1211 void
emitVFETCH(const Instruction * i)1212 CodeEmitterNVC0::emitVFETCH(const Instruction *i)
1213 {
1214    code[0] = 0x00000006;
1215    code[1] = 0x06000000 | i->src(0).get()->reg.data.offset;
1216 
1217    if (i->perPatch)
1218       code[0] |= 0x100;
1219    if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT)
1220       code[0] |= 0x200; // yes, TCPs can read from *outputs* of other threads
1221 
1222    emitPredicate(i);
1223 
1224    code[0] |= ((i->getDef(0)->reg.size / 4) - 1) << 5;
1225 
1226    defId(i->def(0), 14);
1227    srcId(i->src(0).getIndirect(0), 20);
1228    srcId(i->src(0).getIndirect(1), 26); // vertex address
1229 }
1230 
1231 void
emitEXPORT(const Instruction * i)1232 CodeEmitterNVC0::emitEXPORT(const Instruction *i)
1233 {
1234    unsigned int size = typeSizeof(i->dType);
1235 
1236    code[0] = 0x00000006 | ((size / 4 - 1) << 5);
1237    code[1] = 0x0a000000 | i->src(0).get()->reg.data.offset;
1238 
1239    assert(!(code[1] & ((size == 12) ? 15 : (size - 1))));
1240 
1241    if (i->perPatch)
1242       code[0] |= 0x100;
1243 
1244    emitPredicate(i);
1245 
1246    assert(i->src(1).getFile() == FILE_GPR);
1247 
1248    srcId(i->src(0).getIndirect(0), 20);
1249    srcId(i->src(0).getIndirect(1), 32 + 17); // vertex base address
1250    srcId(i->src(1), 26);
1251 }
1252 
1253 void
emitOUT(const Instruction * i)1254 CodeEmitterNVC0::emitOUT(const Instruction *i)
1255 {
1256    code[0] = 0x00000006;
1257    code[1] = 0x1c000000;
1258 
1259    emitPredicate(i);
1260 
1261    defId(i->def(0), 14); // new secret address
1262    srcId(i->src(0), 20); // old secret address, should be 0 initially
1263 
1264    assert(i->src(0).getFile() == FILE_GPR);
1265 
1266    if (i->op == OP_EMIT)
1267       code[0] |= 1 << 5;
1268    if (i->op == OP_RESTART || i->subOp == NV50_IR_SUBOP_EMIT_RESTART)
1269       code[0] |= 1 << 6;
1270 
1271    // vertex stream
1272    if (i->src(1).getFile() == FILE_IMMEDIATE) {
1273       code[1] |= 0xc000;
1274       code[0] |= SDATA(i->src(1)).u32 << 26;
1275    } else {
1276       srcId(i->src(1), 26);
1277    }
1278 }
1279 
1280 void
emitInterpMode(const Instruction * i)1281 CodeEmitterNVC0::emitInterpMode(const Instruction *i)
1282 {
1283    if (i->encSize == 8) {
1284       code[0] |= i->ipa << 6; // TODO: INTERP_SAMPLEID
1285    } else {
1286       if (i->getInterpMode() == NV50_IR_INTERP_SC)
1287          code[0] |= 0x80;
1288       assert(i->op == OP_PINTERP && i->getSampleMode() == 0);
1289    }
1290 }
1291 
1292 void
emitINTERP(const Instruction * i)1293 CodeEmitterNVC0::emitINTERP(const Instruction *i)
1294 {
1295    const uint32_t base = i->getSrc(0)->reg.data.offset;
1296 
1297    if (i->encSize == 8) {
1298       code[0] = 0x00000000;
1299       code[1] = 0xc0000000 | (base & 0xffff);
1300 
1301       if (i->saturate)
1302          code[0] |= 1 << 5;
1303 
1304       if (i->op == OP_PINTERP)
1305          srcId(i->src(1), 26);
1306       else
1307          code[0] |= 0x3f << 26;
1308 
1309       srcId(i->src(0).getIndirect(0), 20);
1310    } else {
1311       assert(i->op == OP_PINTERP);
1312       code[0] = 0x00000009 | ((base & 0xc) << 6) | ((base >> 4) << 26);
1313       srcId(i->src(1), 20);
1314    }
1315    emitInterpMode(i);
1316 
1317    emitPredicate(i);
1318    defId(i->def(0), 14);
1319 
1320    if (i->getSampleMode() == NV50_IR_INTERP_OFFSET)
1321       srcId(i->src(i->op == OP_PINTERP ? 2 : 1), 17);
1322    else
1323       code[1] |= 0x3f << 17;
1324 }
1325 
1326 void
emitLoadStoreType(DataType ty)1327 CodeEmitterNVC0::emitLoadStoreType(DataType ty)
1328 {
1329    uint8_t val;
1330 
1331    switch (ty) {
1332    case TYPE_U8:
1333       val = 0x00;
1334       break;
1335    case TYPE_S8:
1336       val = 0x20;
1337       break;
1338    case TYPE_F16:
1339    case TYPE_U16:
1340       val = 0x40;
1341       break;
1342    case TYPE_S16:
1343       val = 0x60;
1344       break;
1345    case TYPE_F32:
1346    case TYPE_U32:
1347    case TYPE_S32:
1348       val = 0x80;
1349       break;
1350    case TYPE_F64:
1351    case TYPE_U64:
1352    case TYPE_S64:
1353       val = 0xa0;
1354       break;
1355    case TYPE_B128:
1356       val = 0xc0;
1357       break;
1358    default:
1359       val = 0x80;
1360       assert(!"invalid type");
1361       break;
1362    }
1363    code[0] |= val;
1364 }
1365 
1366 void
emitCachingMode(CacheMode c)1367 CodeEmitterNVC0::emitCachingMode(CacheMode c)
1368 {
1369    uint32_t val;
1370 
1371    switch (c) {
1372    case CACHE_CA:
1373 // case CACHE_WB:
1374       val = 0x000;
1375       break;
1376    case CACHE_CG:
1377       val = 0x100;
1378       break;
1379    case CACHE_CS:
1380       val = 0x200;
1381       break;
1382    case CACHE_CV:
1383 // case CACHE_WT:
1384       val = 0x300;
1385       break;
1386    default:
1387       val = 0;
1388       assert(!"invalid caching mode");
1389       break;
1390    }
1391    code[0] |= val;
1392 }
1393 
1394 void
emitSTORE(const Instruction * i)1395 CodeEmitterNVC0::emitSTORE(const Instruction *i)
1396 {
1397    uint32_t opc;
1398 
1399    switch (i->src(0).getFile()) {
1400    case FILE_MEMORY_GLOBAL: opc = 0x90000000; break;
1401    case FILE_MEMORY_LOCAL:  opc = 0xc8000000; break;
1402    case FILE_MEMORY_SHARED: opc = 0xc9000000; break;
1403    default:
1404       assert(!"invalid memory file");
1405       opc = 0;
1406       break;
1407    }
1408    code[0] = 0x00000005;
1409    code[1] = opc;
1410 
1411    setAddress16(i->src(0));
1412    srcId(i->src(1), 14);
1413    srcId(i->src(0).getIndirect(0), 20);
1414 
1415    emitPredicate(i);
1416 
1417    emitLoadStoreType(i->dType);
1418    emitCachingMode(i->cache);
1419 }
1420 
1421 void
emitLOAD(const Instruction * i)1422 CodeEmitterNVC0::emitLOAD(const Instruction *i)
1423 {
1424    uint32_t opc;
1425 
1426    code[0] = 0x00000005;
1427 
1428    switch (i->src(0).getFile()) {
1429    case FILE_MEMORY_GLOBAL: opc = 0x80000000; break;
1430    case FILE_MEMORY_LOCAL:  opc = 0xc0000000; break;
1431    case FILE_MEMORY_SHARED: opc = 0xc1000000; break;
1432    case FILE_MEMORY_CONST:
1433       if (!i->src(0).isIndirect(0) && typeSizeof(i->dType) == 4) {
1434          emitMOV(i); // not sure if this is any better
1435          return;
1436       }
1437       opc = 0x14000000 | (i->src(0).get()->reg.fileIndex << 10);
1438       code[0] = 0x00000006 | (i->subOp << 8);
1439       break;
1440    default:
1441       assert(!"invalid memory file");
1442       opc = 0;
1443       break;
1444    }
1445    code[1] = opc;
1446 
1447    defId(i->def(0), 14);
1448 
1449    setAddress16(i->src(0));
1450    srcId(i->src(0).getIndirect(0), 20);
1451 
1452    emitPredicate(i);
1453 
1454    emitLoadStoreType(i->dType);
1455    emitCachingMode(i->cache);
1456 }
1457 
1458 uint8_t
getSRegEncoding(const ValueRef & ref)1459 CodeEmitterNVC0::getSRegEncoding(const ValueRef& ref)
1460 {
1461    switch (SDATA(ref).sv.sv) {
1462    case SV_LANEID:        return 0x00;
1463    case SV_PHYSID:        return 0x03;
1464    case SV_VERTEX_COUNT:  return 0x10;
1465    case SV_INVOCATION_ID: return 0x11;
1466    case SV_YDIR:          return 0x12;
1467    case SV_TID:           return 0x21 + SDATA(ref).sv.index;
1468    case SV_CTAID:         return 0x25 + SDATA(ref).sv.index;
1469    case SV_NTID:          return 0x29 + SDATA(ref).sv.index;
1470    case SV_GRIDID:        return 0x2c;
1471    case SV_NCTAID:        return 0x2d + SDATA(ref).sv.index;
1472    case SV_LBASE:         return 0x34;
1473    case SV_SBASE:         return 0x30;
1474    case SV_CLOCK:         return 0x50 + SDATA(ref).sv.index;
1475    default:
1476       assert(!"no sreg for system value");
1477       return 0;
1478    }
1479 }
1480 
1481 void
emitMOV(const Instruction * i)1482 CodeEmitterNVC0::emitMOV(const Instruction *i)
1483 {
1484    if (i->src(0).getFile() == FILE_SYSTEM_VALUE) {
1485       uint8_t sr = getSRegEncoding(i->src(0));
1486 
1487       if (i->encSize == 8) {
1488          code[0] = 0x00000004 | (sr << 26);
1489          code[1] = 0x2c000000;
1490       } else {
1491          code[0] = 0x40000008 | (sr << 20);
1492       }
1493       defId(i->def(0), 14);
1494 
1495       emitPredicate(i);
1496    } else
1497    if (i->encSize == 8) {
1498       uint64_t opc;
1499 
1500       if (i->src(0).getFile() == FILE_IMMEDIATE)
1501          opc = HEX64(18000000, 000001e2);
1502       else
1503       if (i->src(0).getFile() == FILE_PREDICATE)
1504          opc = HEX64(080e0000, 1c000004);
1505       else
1506          opc = HEX64(28000000, 00000004);
1507 
1508       opc |= i->lanes << 5;
1509 
1510       emitForm_B(i, opc);
1511    } else {
1512       uint32_t imm;
1513 
1514       if (i->src(0).getFile() == FILE_IMMEDIATE) {
1515          imm = SDATA(i->src(0)).u32;
1516          if (imm & 0xfff00000) {
1517             assert(!(imm & 0x000fffff));
1518             code[0] = 0x00000318 | imm;
1519          } else {
1520             assert(imm < 0x800 || ((int32_t)imm >= -0x800));
1521             code[0] = 0x00000118 | (imm << 20);
1522          }
1523       } else {
1524          code[0] = 0x0028;
1525          emitShortSrc2(i->src(0));
1526       }
1527       defId(i->def(0), 14);
1528 
1529       emitPredicate(i);
1530    }
1531 }
1532 
1533 bool
emitInstruction(Instruction * insn)1534 CodeEmitterNVC0::emitInstruction(Instruction *insn)
1535 {
1536    unsigned int size = insn->encSize;
1537 
1538    if (writeIssueDelays && !(codeSize & 0x3f))
1539       size += 8;
1540 
1541    if (!insn->encSize) {
1542       ERROR("skipping unencodable instruction: "); insn->print();
1543       return false;
1544    } else
1545    if (codeSize + size > codeSizeLimit) {
1546       ERROR("code emitter output buffer too small\n");
1547       return false;
1548    }
1549 
1550    if (writeIssueDelays) {
1551       if (!(codeSize & 0x3f)) {
1552          code[0] = 0x00000007; // cf issue delay "instruction"
1553          code[1] = 0x20000000;
1554          code += 2;
1555          codeSize += 8;
1556       }
1557       const unsigned int id = (codeSize & 0x3f) / 8 - 1;
1558       uint32_t *data = code - (id * 2 + 2);
1559       if (id <= 2) {
1560          data[0] |= insn->sched << (id * 8 + 4);
1561       } else
1562       if (id == 3) {
1563          data[0] |= insn->sched << 28;
1564          data[1] |= insn->sched >> 4;
1565       } else {
1566          data[1] |= insn->sched << ((id - 4) * 8 + 4);
1567       }
1568    }
1569 
1570    // assert that instructions with multiple defs don't corrupt registers
1571    for (int d = 0; insn->defExists(d); ++d)
1572       assert(insn->asTex() || insn->def(d).rep()->reg.data.id >= 0);
1573 
1574    switch (insn->op) {
1575    case OP_MOV:
1576    case OP_RDSV:
1577       emitMOV(insn);
1578       break;
1579    case OP_NOP:
1580       break;
1581    case OP_LOAD:
1582       emitLOAD(insn);
1583       break;
1584    case OP_STORE:
1585       emitSTORE(insn);
1586       break;
1587    case OP_LINTERP:
1588    case OP_PINTERP:
1589       emitINTERP(insn);
1590       break;
1591    case OP_VFETCH:
1592       emitVFETCH(insn);
1593       break;
1594    case OP_EXPORT:
1595       emitEXPORT(insn);
1596       break;
1597    case OP_PFETCH:
1598       emitPFETCH(insn);
1599       break;
1600    case OP_EMIT:
1601    case OP_RESTART:
1602       emitOUT(insn);
1603       break;
1604    case OP_ADD:
1605    case OP_SUB:
1606       if (isFloatType(insn->dType))
1607          emitFADD(insn);
1608       else
1609          emitUADD(insn);
1610       break;
1611    case OP_MUL:
1612       if (isFloatType(insn->dType))
1613          emitFMUL(insn);
1614       else
1615          emitUMUL(insn);
1616       break;
1617    case OP_MAD:
1618    case OP_FMA:
1619       if (isFloatType(insn->dType))
1620          emitFMAD(insn);
1621       else
1622          emitIMAD(insn);
1623       break;
1624    case OP_SAD:
1625       emitISAD(insn);
1626       break;
1627    case OP_NOT:
1628       emitNOT(insn);
1629       break;
1630    case OP_AND:
1631       emitLogicOp(insn, 0);
1632       break;
1633    case OP_OR:
1634       emitLogicOp(insn, 1);
1635       break;
1636    case OP_XOR:
1637       emitLogicOp(insn, 2);
1638       break;
1639    case OP_SHL:
1640    case OP_SHR:
1641       emitShift(insn);
1642       break;
1643    case OP_SET:
1644    case OP_SET_AND:
1645    case OP_SET_OR:
1646    case OP_SET_XOR:
1647       emitSET(insn->asCmp());
1648       break;
1649    case OP_SELP:
1650       emitSELP(insn);
1651       break;
1652    case OP_SLCT:
1653       emitSLCT(insn->asCmp());
1654       break;
1655    case OP_MIN:
1656    case OP_MAX:
1657       emitMINMAX(insn);
1658       break;
1659    case OP_ABS:
1660    case OP_NEG:
1661    case OP_CEIL:
1662    case OP_FLOOR:
1663    case OP_TRUNC:
1664    case OP_CVT:
1665    case OP_SAT:
1666       emitCVT(insn);
1667       break;
1668    case OP_RSQ:
1669       emitSFnOp(insn, 5);
1670       break;
1671    case OP_RCP:
1672       emitSFnOp(insn, 4);
1673       break;
1674    case OP_LG2:
1675       emitSFnOp(insn, 3);
1676       break;
1677    case OP_EX2:
1678       emitSFnOp(insn, 2);
1679       break;
1680    case OP_SIN:
1681       emitSFnOp(insn, 1);
1682       break;
1683    case OP_COS:
1684       emitSFnOp(insn, 0);
1685       break;
1686    case OP_PRESIN:
1687    case OP_PREEX2:
1688       emitPreOp(insn);
1689       break;
1690    case OP_TEX:
1691    case OP_TXB:
1692    case OP_TXL:
1693    case OP_TXD:
1694    case OP_TXF:
1695       emitTEX(insn->asTex());
1696       break;
1697    case OP_TXQ:
1698       emitTXQ(insn->asTex());
1699       break;
1700    case OP_TEXBAR:
1701       emitTEXBAR(insn);
1702       break;
1703    case OP_BRA:
1704    case OP_CALL:
1705    case OP_PRERET:
1706    case OP_RET:
1707    case OP_DISCARD:
1708    case OP_EXIT:
1709    case OP_PRECONT:
1710    case OP_CONT:
1711    case OP_PREBREAK:
1712    case OP_BREAK:
1713    case OP_JOINAT:
1714    case OP_BRKPT:
1715    case OP_QUADON:
1716    case OP_QUADPOP:
1717       emitFlow(insn);
1718       break;
1719    case OP_QUADOP:
1720       emitQUADOP(insn, insn->subOp, insn->lanes);
1721       break;
1722    case OP_DFDX:
1723       emitQUADOP(insn, insn->src(0).mod.neg() ? 0x66 : 0x99, 0x4);
1724       break;
1725    case OP_DFDY:
1726       emitQUADOP(insn, insn->src(0).mod.neg() ? 0x5a : 0xa5, 0x5);
1727       break;
1728    case OP_POPCNT:
1729       emitPOPC(insn);
1730       break;
1731    case OP_JOIN:
1732       emitNOP(insn);
1733       insn->join = 1;
1734       break;
1735    case OP_PHI:
1736    case OP_UNION:
1737    case OP_CONSTRAINT:
1738       ERROR("operation should have been eliminated");
1739       return false;
1740    case OP_EXP:
1741    case OP_LOG:
1742    case OP_SQRT:
1743    case OP_POW:
1744       ERROR("operation should have been lowered\n");
1745       return false;
1746    default:
1747       ERROR("unknow op\n");
1748       return false;
1749    }
1750 
1751    if (insn->join) {
1752       code[0] |= 0x10;
1753       assert(insn->encSize == 8);
1754    }
1755 
1756    code += insn->encSize / 4;
1757    codeSize += insn->encSize;
1758    return true;
1759 }
1760 
1761 uint32_t
getMinEncodingSize(const Instruction * i) const1762 CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const
1763 {
1764    const Target::OpInfo &info = targ->getOpInfo(i);
1765 
1766    if (writeIssueDelays || info.minEncSize == 8 || 1)
1767       return 8;
1768 
1769    if (i->ftz || i->saturate || i->join)
1770       return 8;
1771    if (i->rnd != ROUND_N)
1772       return 8;
1773    if (i->predSrc >= 0 && i->op == OP_MAD)
1774       return 8;
1775 
1776    if (i->op == OP_PINTERP) {
1777       if (i->getSampleMode() || 1) // XXX: grr, short op doesn't work
1778          return 8;
1779    } else
1780    if (i->op == OP_MOV && i->lanes != 0xf) {
1781       return 8;
1782    }
1783 
1784    for (int s = 0; i->srcExists(s); ++s) {
1785       if (i->src(s).isIndirect(0))
1786          return 8;
1787 
1788       if (i->src(s).getFile() == FILE_MEMORY_CONST) {
1789          if (SDATA(i->src(s)).offset >= 0x100)
1790             return 8;
1791          if (i->getSrc(s)->reg.fileIndex > 1 &&
1792              i->getSrc(s)->reg.fileIndex != 16)
1793              return 8;
1794       } else
1795       if (i->src(s).getFile() == FILE_IMMEDIATE) {
1796          if (i->dType == TYPE_F32) {
1797             if (SDATA(i->src(s)).u32 >= 0x100)
1798                return 8;
1799          } else {
1800             if (SDATA(i->src(s)).u32 > 0xff)
1801                return 8;
1802          }
1803       }
1804 
1805       if (i->op == OP_CVT)
1806          continue;
1807       if (i->src(s).mod != Modifier(0)) {
1808          if (i->src(s).mod == Modifier(NV50_IR_MOD_ABS))
1809             if (i->op != OP_RSQ)
1810                return 8;
1811          if (i->src(s).mod == Modifier(NV50_IR_MOD_NEG))
1812             if (i->op != OP_ADD || s != 0)
1813                return 8;
1814       }
1815    }
1816 
1817    return 4;
1818 }
1819 
1820 // Simplified, erring on safe side.
1821 class SchedDataCalculator : public Pass
1822 {
1823 public:
SchedDataCalculator(const Target * targ)1824    SchedDataCalculator(const Target *targ) : targ(targ) { }
1825 
1826 private:
1827    struct RegScores
1828    {
1829       struct Resource {
1830          int st[DATA_FILE_COUNT]; // LD to LD delay 3
1831          int ld[DATA_FILE_COUNT]; // ST to ST delay 3
1832          int tex; // TEX to non-TEX delay 17 (0x11)
1833          int sfu; // SFU to SFU delay 3 (except PRE-ops)
1834          int imul; // integer MUL to MUL delay 3
1835       } res;
1836       struct ScoreData {
1837          int r[64];
1838          int p[8];
1839          int c;
1840       } rd, wr;
1841       int base;
1842 
rebasenv50_ir::SchedDataCalculator::RegScores1843       void rebase(const int base)
1844       {
1845          const int delta = this->base - base;
1846          if (!delta)
1847             return;
1848          this->base = 0;
1849 
1850          for (int i = 0; i < 64; ++i) {
1851             rd.r[i] += delta;
1852             wr.r[i] += delta;
1853          }
1854          for (int i = 0; i < 8; ++i) {
1855             rd.p[i] += delta;
1856             wr.p[i] += delta;
1857          }
1858          rd.c += delta;
1859          wr.c += delta;
1860 
1861          for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
1862             res.ld[f] += delta;
1863             res.st[f] += delta;
1864          }
1865          res.sfu += delta;
1866          res.imul += delta;
1867          res.tex += delta;
1868       }
wipenv50_ir::SchedDataCalculator::RegScores1869       void wipe()
1870       {
1871          memset(&rd, 0, sizeof(rd));
1872          memset(&wr, 0, sizeof(wr));
1873          memset(&res, 0, sizeof(res));
1874       }
getLatestnv50_ir::SchedDataCalculator::RegScores1875       int getLatest(const ScoreData& d) const
1876       {
1877          int max = 0;
1878          for (int i = 0; i < 64; ++i)
1879             if (d.r[i] > max)
1880                max = d.r[i];
1881          for (int i = 0; i < 8; ++i)
1882             if (d.p[i] > max)
1883                max = d.p[i];
1884          if (d.c > max)
1885             max = d.c;
1886          return max;
1887       }
getLatestRdnv50_ir::SchedDataCalculator::RegScores1888       inline int getLatestRd() const
1889       {
1890          return getLatest(rd);
1891       }
getLatestWrnv50_ir::SchedDataCalculator::RegScores1892       inline int getLatestWr() const
1893       {
1894          return getLatest(wr);
1895       }
getLatestnv50_ir::SchedDataCalculator::RegScores1896       inline int getLatest() const
1897       {
1898          const int a = getLatestRd();
1899          const int b = getLatestWr();
1900 
1901          int max = MAX2(a, b);
1902          for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
1903             max = MAX2(res.ld[f], max);
1904             max = MAX2(res.st[f], max);
1905          }
1906          max = MAX2(res.sfu, max);
1907          max = MAX2(res.imul, max);
1908          max = MAX2(res.tex, max);
1909          return max;
1910       }
setMaxnv50_ir::SchedDataCalculator::RegScores1911       void setMax(const RegScores *that)
1912       {
1913          for (int i = 0; i < 64; ++i) {
1914             rd.r[i] = MAX2(rd.r[i], that->rd.r[i]);
1915             wr.r[i] = MAX2(wr.r[i], that->wr.r[i]);
1916          }
1917          for (int i = 0; i < 8; ++i) {
1918             rd.p[i] = MAX2(rd.p[i], that->rd.p[i]);
1919             wr.p[i] = MAX2(wr.p[i], that->wr.p[i]);
1920          }
1921          rd.c = MAX2(rd.c, that->rd.c);
1922          wr.c = MAX2(wr.c, that->wr.c);
1923 
1924          for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
1925             res.ld[f] = MAX2(res.ld[f], that->res.ld[f]);
1926             res.st[f] = MAX2(res.st[f], that->res.st[f]);
1927          }
1928          res.sfu = MAX2(res.sfu, that->res.sfu);
1929          res.imul = MAX2(res.imul, that->res.imul);
1930          res.tex = MAX2(res.tex, that->res.tex);
1931       }
printnv50_ir::SchedDataCalculator::RegScores1932       void print(int cycle)
1933       {
1934          for (int i = 0; i < 64; ++i) {
1935             if (rd.r[i] > cycle)
1936                INFO("rd $r%i @ %i\n", i, rd.r[i]);
1937             if (wr.r[i] > cycle)
1938                INFO("wr $r%i @ %i\n", i, wr.r[i]);
1939          }
1940          for (int i = 0; i < 8; ++i) {
1941             if (rd.p[i] > cycle)
1942                INFO("rd $p%i @ %i\n", i, rd.p[i]);
1943             if (wr.p[i] > cycle)
1944                INFO("wr $p%i @ %i\n", i, wr.p[i]);
1945          }
1946          if (rd.c > cycle)
1947             INFO("rd $c @ %i\n", rd.c);
1948          if (wr.c > cycle)
1949             INFO("wr $c @ %i\n", wr.c);
1950          if (res.sfu > cycle)
1951             INFO("sfu @ %i\n", res.sfu);
1952          if (res.imul > cycle)
1953             INFO("imul @ %i\n", res.imul);
1954          if (res.tex > cycle)
1955             INFO("tex @ %i\n", res.tex);
1956       }
1957    };
1958 
1959    RegScores *score; // for current BB
1960    std::vector<RegScores> scoreBoards;
1961    int cycle;
1962    int prevData;
1963    operation prevOp;
1964 
1965    const Target *targ;
1966 
1967    bool visit(Function *);
1968    bool visit(BasicBlock *);
1969 
1970    void commitInsn(const Instruction *, int cycle);
1971    int calcDelay(const Instruction *, int cycle) const;
1972    void setDelay(Instruction *, int delay, Instruction *next);
1973 
1974    void recordRd(const Value *, const int ready);
1975    void recordWr(const Value *, const int ready);
1976    void checkRd(const Value *, int cycle, int& delay) const;
1977    void checkWr(const Value *, int cycle, int& delay) const;
1978 
1979    int getCycles(const Instruction *, int origDelay) const;
1980 };
1981 
1982 void
setDelay(Instruction * insn,int delay,Instruction * next)1983 SchedDataCalculator::setDelay(Instruction *insn, int delay, Instruction *next)
1984 {
1985    if (insn->op == OP_EXIT)
1986       delay = MAX2(delay, 14);
1987 
1988    if (insn->op == OP_TEXBAR) {
1989       // TODO: except if results not used before EXIT
1990       insn->sched = 0xc2;
1991    } else
1992    if (insn->op == OP_JOIN || insn->join) {
1993       insn->sched = 0x00;
1994    } else
1995    if (delay >= 0 || prevData == 0x04 ||
1996        !next || !targ->canDualIssue(insn, next)) {
1997       insn->sched = static_cast<uint8_t>(MAX2(delay, 0));
1998       if (prevOp == OP_EXPORT)
1999          insn->sched |= 0x40;
2000       else
2001          insn->sched |= 0x20;
2002    } else {
2003       insn->sched = 0x04; // dual-issue
2004    }
2005 
2006    if (prevData != 0x04 || prevOp != OP_EXPORT)
2007       if (insn->sched != 0x04 || insn->op == OP_EXPORT)
2008          prevOp = insn->op;
2009 
2010    prevData = insn->sched;
2011 }
2012 
2013 int
getCycles(const Instruction * insn,int origDelay) const2014 SchedDataCalculator::getCycles(const Instruction *insn, int origDelay) const
2015 {
2016    if (insn->sched & 0x80) {
2017       int c = (insn->sched & 0x0f) * 2 + 1;
2018       if (insn->op == OP_TEXBAR && origDelay > 0)
2019          c += origDelay;
2020       return c;
2021    }
2022    if (insn->sched & 0x60)
2023       return (insn->sched & 0x1f) + 1;
2024    return (insn->sched == 0x04) ? 0 : 32;
2025 }
2026 
2027 bool
visit(Function * func)2028 SchedDataCalculator::visit(Function *func)
2029 {
2030    scoreBoards.resize(func->cfg.getSize());
2031    for (size_t i = 0; i < scoreBoards.size(); ++i)
2032       scoreBoards[i].wipe();
2033    return true;
2034 }
2035 
2036 bool
visit(BasicBlock * bb)2037 SchedDataCalculator::visit(BasicBlock *bb)
2038 {
2039    Instruction *insn;
2040    Instruction *next = NULL;
2041 
2042    int cycle = 0;
2043 
2044    prevData = 0x00;
2045    prevOp = OP_NOP;
2046    score = &scoreBoards.at(bb->getId());
2047 
2048    for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
2049       BasicBlock *in = BasicBlock::get(ei.getNode());
2050       if (in->getExit()) {
2051          if (prevData != 0x04)
2052             prevData = in->getExit()->sched;
2053          prevOp = in->getExit()->op;
2054       }
2055       if (ei.getType() != Graph::Edge::BACK)
2056          score->setMax(&scoreBoards.at(in->getId()));
2057       // back branches will wait until all target dependencies are satisfied
2058    }
2059    if (bb->cfg.incidentCount() > 1)
2060       prevOp = OP_NOP;
2061 
2062 #ifdef NVC0_DEBUG_SCHED_DATA
2063    INFO("=== BB:%i initial scores\n", bb->getId());
2064    score->print(cycle);
2065 #endif
2066 
2067    for (insn = bb->getEntry(); insn && insn->next; insn = insn->next) {
2068       next = insn->next;
2069 
2070       commitInsn(insn, cycle);
2071       int delay = calcDelay(next, cycle);
2072       setDelay(insn, delay, next);
2073       cycle += getCycles(insn, delay);
2074 
2075 #ifdef NVC0_DEBUG_SCHED_DATA
2076       INFO("cycle %i, sched %02x\n", cycle, insn->sched);
2077       insn->print();
2078       next->print();
2079 #endif
2080    }
2081    if (!insn)
2082       return true;
2083    commitInsn(insn, cycle);
2084 
2085    int bbDelay = -1;
2086 
2087    for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
2088       BasicBlock *out = BasicBlock::get(ei.getNode());
2089 
2090       if (ei.getType() != Graph::Edge::BACK) {
2091          // only test the first instruction of the outgoing block
2092          next = out->getEntry();
2093          if (next)
2094             bbDelay = MAX2(bbDelay, calcDelay(next, cycle));
2095       } else {
2096          // wait until all dependencies are satisfied
2097          const int regsFree = score->getLatest();
2098          next = out->getFirst();
2099          for (int c = cycle; next && c < regsFree; next = next->next) {
2100             bbDelay = MAX2(bbDelay, calcDelay(next, c));
2101             c += getCycles(next, bbDelay);
2102          }
2103          next = NULL;
2104       }
2105    }
2106    if (bb->cfg.outgoingCount() != 1)
2107       next = NULL;
2108    setDelay(insn, bbDelay, next);
2109    cycle += getCycles(insn, bbDelay);
2110 
2111    score->rebase(cycle); // common base for initializing out blocks' scores
2112    return true;
2113 }
2114 
2115 #define NVE4_MAX_ISSUE_DELAY 0x1f
2116 int
calcDelay(const Instruction * insn,int cycle) const2117 SchedDataCalculator::calcDelay(const Instruction *insn, int cycle) const
2118 {
2119    int delay = 0, ready = cycle;
2120 
2121    for (int s = 0; insn->srcExists(s); ++s)
2122       checkRd(insn->getSrc(s), cycle, delay);
2123    // WAR & WAW don't seem to matter
2124    // for (int s = 0; insn->srcExists(s); ++s)
2125    //   recordRd(insn->getSrc(s), cycle);
2126 
2127    switch (Target::getOpClass(insn->op)) {
2128    case OPCLASS_SFU:
2129       ready = score->res.sfu;
2130       break;
2131    case OPCLASS_ARITH:
2132       if (insn->op == OP_MUL && !isFloatType(insn->dType))
2133          ready = score->res.imul;
2134       break;
2135    case OPCLASS_TEXTURE:
2136       ready = score->res.tex;
2137       break;
2138    case OPCLASS_LOAD:
2139       ready = score->res.ld[insn->src(0).getFile()];
2140       break;
2141    case OPCLASS_STORE:
2142       ready = score->res.st[insn->src(0).getFile()];
2143       break;
2144    default:
2145       break;
2146    }
2147    if (Target::getOpClass(insn->op) != OPCLASS_TEXTURE)
2148       ready = MAX2(ready, score->res.tex);
2149 
2150    delay = MAX2(delay, ready - cycle);
2151 
2152    // if can issue next cycle, delay is 0, not 1
2153    return MIN2(delay - 1, NVE4_MAX_ISSUE_DELAY);
2154 }
2155 
2156 void
commitInsn(const Instruction * insn,int cycle)2157 SchedDataCalculator::commitInsn(const Instruction *insn, int cycle)
2158 {
2159    const int ready = cycle + targ->getLatency(insn);
2160 
2161    for (int d = 0; insn->defExists(d); ++d)
2162       recordWr(insn->getDef(d), ready);
2163    // WAR & WAW don't seem to matter
2164    // for (int s = 0; insn->srcExists(s); ++s)
2165    //   recordRd(insn->getSrc(s), cycle);
2166 
2167    switch (Target::getOpClass(insn->op)) {
2168    case OPCLASS_SFU:
2169       score->res.sfu = cycle + 4;
2170       break;
2171    case OPCLASS_ARITH:
2172       if (insn->op == OP_MUL && !isFloatType(insn->dType))
2173          score->res.imul = cycle + 4;
2174       break;
2175    case OPCLASS_TEXTURE:
2176       score->res.tex = cycle + 18;
2177       break;
2178    case OPCLASS_LOAD:
2179       if (insn->src(0).getFile() == FILE_MEMORY_CONST)
2180          break;
2181       score->res.ld[insn->src(0).getFile()] = cycle + 4;
2182       score->res.st[insn->src(0).getFile()] = ready;
2183       break;
2184    case OPCLASS_STORE:
2185       score->res.st[insn->src(0).getFile()] = cycle + 4;
2186       score->res.ld[insn->src(0).getFile()] = ready;
2187       break;
2188    case OPCLASS_OTHER:
2189       if (insn->op == OP_TEXBAR)
2190          score->res.tex = cycle;
2191       break;
2192    default:
2193       break;
2194    }
2195 
2196 #ifdef NVC0_DEBUG_SCHED_DATA
2197    score->print(cycle);
2198 #endif
2199 }
2200 
2201 void
checkRd(const Value * v,int cycle,int & delay) const2202 SchedDataCalculator::checkRd(const Value *v, int cycle, int& delay) const
2203 {
2204    int ready = cycle;
2205    int a, b;
2206 
2207    switch (v->reg.file) {
2208    case FILE_GPR:
2209       a = v->reg.data.id;
2210       b = a + v->reg.size / 4;
2211       for (int r = a; r < b; ++r)
2212          ready = MAX2(ready, score->rd.r[r]);
2213       break;
2214    case FILE_PREDICATE:
2215       ready = MAX2(ready, score->rd.p[v->reg.data.id]);
2216       break;
2217    case FILE_FLAGS:
2218       ready = MAX2(ready, score->rd.c);
2219       break;
2220    case FILE_SHADER_INPUT:
2221    case FILE_SHADER_OUTPUT: // yes, TCPs can read outputs
2222    case FILE_MEMORY_LOCAL:
2223    case FILE_MEMORY_CONST:
2224    case FILE_MEMORY_SHARED:
2225    case FILE_MEMORY_GLOBAL:
2226    case FILE_SYSTEM_VALUE:
2227       // TODO: any restrictions here ?
2228       break;
2229    case FILE_IMMEDIATE:
2230       break;
2231    default:
2232       assert(0);
2233       break;
2234    }
2235    if (cycle < ready)
2236       delay = MAX2(delay, ready - cycle);
2237 }
2238 
2239 void
checkWr(const Value * v,int cycle,int & delay) const2240 SchedDataCalculator::checkWr(const Value *v, int cycle, int& delay) const
2241 {
2242    int ready = cycle;
2243    int a, b;
2244 
2245    switch (v->reg.file) {
2246    case FILE_GPR:
2247       a = v->reg.data.id;
2248       b = a + v->reg.size / 4;
2249       for (int r = a; r < b; ++r)
2250          ready = MAX2(ready, score->wr.r[r]);
2251       break;
2252    case FILE_PREDICATE:
2253       ready = MAX2(ready, score->wr.p[v->reg.data.id]);
2254       break;
2255    default:
2256       assert(v->reg.file == FILE_FLAGS);
2257       ready = MAX2(ready, score->wr.c);
2258       break;
2259    }
2260    if (cycle < ready)
2261       delay = MAX2(delay, ready - cycle);
2262 }
2263 
2264 void
recordWr(const Value * v,const int ready)2265 SchedDataCalculator::recordWr(const Value *v, const int ready)
2266 {
2267    int a = v->reg.data.id;
2268 
2269    if (v->reg.file == FILE_GPR) {
2270       int b = a + v->reg.size / 4;
2271       for (int r = a; r < b; ++r)
2272          score->rd.r[r] = ready;
2273    } else
2274    // $c, $pX: shorter issue-to-read delay (at least as exec pred and carry)
2275    if (v->reg.file == FILE_PREDICATE) {
2276       score->rd.p[a] = ready + 4;
2277    } else {
2278       assert(v->reg.file == FILE_FLAGS);
2279       score->rd.c = ready + 4;
2280    }
2281 }
2282 
2283 void
recordRd(const Value * v,const int ready)2284 SchedDataCalculator::recordRd(const Value *v, const int ready)
2285 {
2286    int a = v->reg.data.id;
2287 
2288    if (v->reg.file == FILE_GPR) {
2289       int b = a + v->reg.size / 4;
2290       for (int r = a; r < b; ++r)
2291          score->wr.r[r] = ready;
2292    } else
2293    if (v->reg.file == FILE_PREDICATE) {
2294       score->wr.p[a] = ready;
2295    } else
2296    if (v->reg.file == FILE_FLAGS) {
2297       score->wr.c = ready;
2298    }
2299 }
2300 
2301 void
prepareEmission(Function * func)2302 CodeEmitterNVC0::prepareEmission(Function *func)
2303 {
2304    const Target *targ = func->getProgram()->getTarget();
2305 
2306    CodeEmitter::prepareEmission(func);
2307 
2308    if (targ->hasSWSched) {
2309       SchedDataCalculator sched(targ);
2310       sched.run(func, true, true);
2311    }
2312 }
2313 
CodeEmitterNVC0(const TargetNVC0 * target)2314 CodeEmitterNVC0::CodeEmitterNVC0(const TargetNVC0 *target)
2315    : CodeEmitter(target),
2316      writeIssueDelays(target->hasSWSched)
2317 {
2318    code = NULL;
2319    codeSize = codeSizeLimit = 0;
2320    relocInfo = NULL;
2321 }
2322 
2323 CodeEmitter *
getCodeEmitter(Program::Type type)2324 TargetNVC0::getCodeEmitter(Program::Type type)
2325 {
2326    CodeEmitterNVC0 *emit = new CodeEmitterNVC0(this);
2327    emit->setProgramType(type);
2328    return emit;
2329 }
2330 
2331 } // namespace nv50_ir
2332