1 /*
2  * Copyright 2011 Christoph Bumiller
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19  * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20  * SOFTWARE.
21  */
22 
23 #include "nv50_ir_target_nvc0.h"
24 
25 namespace nv50_ir {
26 
getTargetNVC0(unsigned int chipset)27 Target *getTargetNVC0(unsigned int chipset)
28 {
29    return new TargetNVC0(chipset);
30 }
31 
TargetNVC0(unsigned int card)32 TargetNVC0::TargetNVC0(unsigned int card) : Target(false, card >= 0xe4)
33 {
34    chipset = card;
35    initOpInfo();
36 }
37 
// BUILTINS / LIBRARY FUNCTIONS:
39 
// laziness -> will just hardcode everything for the time being
41 
42 // Will probably make this nicer once we support subroutines properly,
43 // i.e. when we have an input IR that provides function declarations.
44 
45 // TODO: separate version for nve4+ which doesn't like the 4-byte insn formats
// Pre-assembled code of the builtin library routines, stored back to back.
// The byte offset of each routine is kept in nvc0_builtin_offsets, which has
// to be updated whenever the length of a routine changes here.
static const uint32_t nvc0_builtin_code[] =
{
// DIV U32: slow unsigned integer division
//
// UNR recurrence (q = a / b):
// look for z such that 2^32 - b <= b * z < 2^32
// then q - 1 <= (a * z) / 2^32 <= q
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p1
// SIZE:    26 / 14 * 8 bytes (#if 1 variant / #else variant)
//
// NOTE: nvc0_builtin_offsets assumes the 26 * 8 byte (#if 1) variant.
//
#if 1
   0x04009c03, 0x78000000,
   0x7c209c82, 0x38000000, // 0x7c209cdd,
   0x0400dde2, 0x18000000, // 0x0010dd18,
   0x08309c03, 0x60000000,
   0x05205d04, 0x1c000000, // 0x05605c18,
   0x0810dc03, 0x50000000, // 0x0810dc2a,
   0x0c209c43, 0x20040000,
   0x0810dc03, 0x50000000,
   0x0c209c43, 0x20040000,
   0x0810dc03, 0x50000000,
   0x0c209c43, 0x20040000,
   0x0810dc03, 0x50000000,
   0x0c209c43, 0x20040000,
   0x0810dc03, 0x50000000,
   0x0c209c43, 0x20040000,
   0x0000dde4, 0x28000000,
   0x08001c43, 0x50000000,
   0x05209d04, 0x1c000000, // 0x05609c18,
   0x00105c03, 0x20060000, // 0x0010430d,
   0x0811dc03, 0x1b0e0000,
   0x08104103, 0x48000000,
   0x04000002, 0x08000000,
   0x0811c003, 0x1b0e0000,
   0x08104103, 0x48000000,
   0x04000002, 0x08000000, // 0x040000ac,
   0x00001de7, 0x90000000, // 0x90001dff,
#else
   0x0401dc03, 0x1b0e0000,
   0x00008003, 0x78000000,
   0x0400c003, 0x78000000,
   0x0c20c103, 0x48000000,
   0x0c108003, 0x60000000,
   0x00005c28,
   0x00001d18,
   0x0031c023, 0x1b0ec000,
   0xb000a1e7, 0x40000000,
   0x04000003, 0x6000c000,
   0x0813dc03, 0x1b000000,
   0x0420446c,
   0x040004bd,
   0x04208003, 0x5800c000,
   0x0430c103, 0x4800c000,
   0x0ffc5dff,
   0x90001dff,
#endif

// DIV S32: slow signed integer division
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p3
// SIZE:    23 * 8 bytes
//
   0xfc05dc23, 0x188e0000,
   0xfc17dc23, 0x18c40000,
   0x01201ec4, 0x1c000000, // 0x03301e18,
   0x05205ec4, 0x1c000000, // 0x07305e18,
   0x0401dc03, 0x1b0e0000,
   0x00008003, 0x78000000,
   0x0400c003, 0x78000000,
   0x0c20c103, 0x48000000,
   0x0c108003, 0x60000000,
   0x00005de4, 0x28000000, // 0x00005c28,
   0x00001de2, 0x18000000, // 0x00001d18,
   0x0031c023, 0x1b0ec000,
   0xe000a1e7, 0x40000000, // 0xb000a1e7, 0x40000000,
   0x04000003, 0x6000c000,
   0x0813dc03, 0x1b000000,
   0x04204603, 0x48000000, // 0x0420446c,
   0x04000442, 0x38000000, // 0x040004bd,
   0x04208003, 0x5800c000,
   0x0430c103, 0x4800c000,
   0xe0001de7, 0x4003fffe, // 0x0ffc5dff,
   0x01200f84, 0x1c000000, // 0x01700e18,
   0x05204b84, 0x1c000000, // 0x05704a18,
   0x00001de7, 0x90000000, // 0x90001dff,

// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rcp(x))
// CLOBBER: $r2 - $r7
// SIZE:    9 * 8 bytes
//
   0x9810dc08,
   0x00009c28,
   0x4001df18,
   0x00019d18,
   0x08011e01, 0x200c0000,
   0x10209c01, 0x50000000,
   0x08011e01, 0x200c0000,
   0x10209c01, 0x50000000,
   0x08011e01, 0x200c0000,
   0x10201c01, 0x50000000,
   0x00001de7, 0x90000000,

// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rsqrt(x))
// CLOBBER: $r2 - $r7
// SIZE:    14 * 8 bytes
//
   0x9c10dc08,
   0x00009c28,
   0x00019d18,
   0x3fe1df18,
   0x18001c01, 0x50000000,
   0x0001dde2, 0x18ffe000,
   0x08211c01, 0x50000000,
   0x10011e01, 0x200c0000,
   0x10209c01, 0x50000000,
   0x08211c01, 0x50000000,
   0x10011e01, 0x200c0000,
   0x10209c01, 0x50000000,
   0x08211c01, 0x50000000,
   0x10011e01, 0x200c0000,
   0x10201c01, 0x50000000,
   0x00001de7, 0x90000000,
};
180 
181 static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] =
182 {
183    0,
184    8 * (26),
185    8 * (26 + 23),
186    8 * (26 + 23 + 9)
187 };
188 
189 void
getBuiltinCode(const uint32_t ** code,uint32_t * size) const190 TargetNVC0::getBuiltinCode(const uint32_t **code, uint32_t *size) const
191 {
192    *code = &nvc0_builtin_code[0];
193    *size = sizeof(nvc0_builtin_code);
194 }
195 
196 uint32_t
getBuiltinOffset(int builtin) const197 TargetNVC0::getBuiltinOffset(int builtin) const
198 {
199    assert(builtin < NVC0_BUILTIN_COUNT);
200    return nvc0_builtin_offsets[builtin];
201 }
202 
// Compact per-operation description consumed by TargetNVC0::initOpInfo().
// In the m* and f* masks, bit s (s = 0..2) refers to source operand s.
struct opProperties
{
   operation op;            // operation the entry applies to
   unsigned int mNeg   : 4; // sources that accept the NEG modifier
   unsigned int mAbs   : 4; // sources that accept the ABS modifier
   unsigned int mNot   : 4; // sources that accept the NOT modifier
   unsigned int mSat   : 4; // bit 3 (0x8): destination accepts saturation
   unsigned int fConst : 3; // sources that may come from FILE_MEMORY_CONST
   unsigned int fImmd  : 4; // sources that may be immediates;
                            // last bit indicates if full immediate is supported
};
213 
// Modifier / source file capabilities of the operations that deviate from the
// defaults set up in TargetNVC0::initOpInfo(). Operations not listed here keep
// those defaults. See struct opProperties for the meaning of each column.
static const struct opProperties _initProps[] =
{
   //           neg  abs  not  sat  c[]  imm
   { OP_ADD,    0x3, 0x3, 0x0, 0x8, 0x2, 0x2 | 0x8 },
   { OP_SUB,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 | 0x8 },
   { OP_MUL,    0x3, 0x0, 0x0, 0x8, 0x2, 0x2 | 0x8 },
   { OP_MAX,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
   { OP_MIN,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
   { OP_MAD,    0x7, 0x0, 0x0, 0x8, 0x6, 0x2 | 0x8 }, // special c[] constraint
   { OP_ABS,    0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
   { OP_NEG,    0x0, 0x1, 0x0, 0x0, 0x1, 0x0 },
   { OP_CVT,    0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
   { OP_CEIL,   0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
   { OP_FLOOR,  0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
   { OP_TRUNC,  0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
   { OP_AND,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
   { OP_OR,     0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
   { OP_XOR,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
   { OP_SHL,    0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
   { OP_SHR,    0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
   { OP_SET,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
   { OP_SLCT,   0x4, 0x0, 0x0, 0x0, 0x6, 0x2 }, // special c[] constraint
   { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
   { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
   { OP_COS,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
   { OP_SIN,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
   { OP_EX2,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
   { OP_LG2,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
   { OP_RCP,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
   { OP_RSQ,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
   { OP_DFDX,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
   { OP_DFDY,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
   { OP_CALL,   0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
   { OP_INSBF,  0x0, 0x0, 0x0, 0x0, 0x0, 0x4 },
   { OP_SET_AND, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
   { OP_SET_OR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
   { OP_SET_XOR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
   // saturate only:
   { OP_LINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
   { OP_PINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
};
255 
initOpInfo()256 void TargetNVC0::initOpInfo()
257 {
258    unsigned int i, j;
259 
260    static const uint32_t commutative[(OP_LAST + 31) / 32] =
261    {
262       // ADD, MAD, MUL, AND, OR, XOR, MAX, MIN
263       0x0670ca00, 0x0000003f, 0x00000000
264    };
265 
266    static const uint32_t shortForm[(OP_LAST + 31) / 32] =
267    {
268       // ADD, MAD, MUL, AND, OR, XOR, PRESIN, PREEX2, SFN, CVT, PINTERP, MOV
269       0x0670ca00, 0x00000000, 0x00000000
270    };
271 
272    static const operation noDest[] =
273    {
274       OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT,
275       OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET,
276       OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART,
277       OP_QUADON, OP_QUADPOP, OP_TEXBAR
278    };
279 
280    for (i = 0; i < DATA_FILE_COUNT; ++i)
281       nativeFileMap[i] = (DataFile)i;
282    nativeFileMap[FILE_ADDRESS] = FILE_GPR;
283 
284    for (i = 0; i < OP_LAST; ++i) {
285       opInfo[i].variants = NULL;
286       opInfo[i].op = (operation)i;
287       opInfo[i].srcTypes = 1 << (int)TYPE_F32;
288       opInfo[i].dstTypes = 1 << (int)TYPE_F32;
289       opInfo[i].immdBits = 0;
290       opInfo[i].srcNr = operationSrcNr[i];
291 
292       for (j = 0; j < opInfo[i].srcNr; ++j) {
293          opInfo[i].srcMods[j] = 0;
294          opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR;
295       }
296       opInfo[i].dstMods = 0;
297       opInfo[i].dstFiles = 1 << (int)FILE_GPR;
298 
299       opInfo[i].hasDest = 1;
300       opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
301       opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
302       opInfo[i].pseudo = (i < OP_MOV);
303       opInfo[i].predicate = !opInfo[i].pseudo;
304       opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
305       opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8;
306    }
307    for (i = 0; i < sizeof(noDest) / sizeof(noDest[0]); ++i)
308       opInfo[noDest[i]].hasDest = 0;
309 
310    for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) {
311       const struct opProperties *prop = &_initProps[i];
312 
313       for (int s = 0; s < 3; ++s) {
314          if (prop->mNeg & (1 << s))
315             opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG;
316          if (prop->mAbs & (1 << s))
317             opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS;
318          if (prop->mNot & (1 << s))
319             opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT;
320          if (prop->fConst & (1 << s))
321             opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST;
322          if (prop->fImmd & (1 << s))
323             opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE;
324          if (prop->fImmd & 8)
325             opInfo[prop->op].immdBits = 0xffffffff;
326       }
327       if (prop->mSat & 8)
328          opInfo[prop->op].dstMods = NV50_IR_MOD_SAT;
329    }
330 }
331 
332 unsigned int
getFileSize(DataFile file) const333 TargetNVC0::getFileSize(DataFile file) const
334 {
335    switch (file) {
336    case FILE_NULL:          return 0;
337    case FILE_GPR:           return 63;
338    case FILE_PREDICATE:     return 7;
339    case FILE_FLAGS:         return 1;
340    case FILE_ADDRESS:       return 0;
341    case FILE_IMMEDIATE:     return 0;
342    case FILE_MEMORY_CONST:  return 65536;
343    case FILE_SHADER_INPUT:  return 0x400;
344    case FILE_SHADER_OUTPUT: return 0x400;
345    case FILE_MEMORY_GLOBAL: return 0xffffffff;
346    case FILE_MEMORY_SHARED: return 16 << 10;
347    case FILE_MEMORY_LOCAL:  return 48 << 10;
348    case FILE_SYSTEM_VALUE:  return 32;
349    default:
350       assert(!"invalid file");
351       return 0;
352    }
353 }
354 
355 unsigned int
getFileUnit(DataFile file) const356 TargetNVC0::getFileUnit(DataFile file) const
357 {
358    if (file == FILE_GPR || file == FILE_ADDRESS || file == FILE_SYSTEM_VALUE)
359       return 2;
360    return 0;
361 }
362 
363 uint32_t
getSVAddress(DataFile shaderFile,const Symbol * sym) const364 TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const
365 {
366    const int idx = sym->reg.data.sv.index;
367    const SVSemantic sv = sym->reg.data.sv.sv;
368 
369    const bool isInput = shaderFile == FILE_SHADER_INPUT;
370 
371    switch (sv) {
372    case SV_POSITION:       return 0x070 + idx * 4;
373    case SV_INSTANCE_ID:    return 0x2f8;
374    case SV_VERTEX_ID:      return 0x2fc;
375    case SV_PRIMITIVE_ID:   return isInput ? 0x060 : 0x040;
376    case SV_LAYER:          return 0x064;
377    case SV_VIEWPORT_INDEX: return 0x068;
378    case SV_POINT_SIZE:     return 0x06c;
379    case SV_CLIP_DISTANCE:  return 0x2c0 + idx * 4;
380    case SV_POINT_COORD:    return 0x2e0 + idx * 4;
381    case SV_FACE:           return 0x3fc;
382    case SV_TESS_FACTOR:    return 0x000 + idx * 4;
383    case SV_TESS_COORD:     return 0x2f0 + idx * 4;
384    default:
385       return 0xffffffff;
386    }
387 }
388 
389 bool
insnCanLoad(const Instruction * i,int s,const Instruction * ld) const390 TargetNVC0::insnCanLoad(const Instruction *i, int s,
391                         const Instruction *ld) const
392 {
393    DataFile sf = ld->src(0).getFile();
394 
395    // immediate 0 can be represented by GPR $r63
396    if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0)
397       return (!i->asTex() && i->op != OP_EXPORT && i->op != OP_STORE);
398 
399    if (s >= opInfo[i->op].srcNr)
400       return false;
401    if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf)))
402       return false;
403 
404    // indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0
405    if (ld->src(0).isIndirect(0))
406       return false;
407 
408    for (int k = 0; i->srcExists(k); ++k) {
409       if (i->src(k).getFile() == FILE_IMMEDIATE) {
410          if (i->getSrc(k)->reg.data.u64 != 0)
411             return false;
412       } else
413       if (i->src(k).getFile() != FILE_GPR &&
414           i->src(k).getFile() != FILE_PREDICATE) {
415          return false;
416       }
417    }
418 
419    // not all instructions support full 32 bit immediates
420    if (sf == FILE_IMMEDIATE) {
421       Storage &reg = ld->getSrc(0)->asImm()->reg;
422 
423       if (opInfo[i->op].immdBits != 0xffffffff) {
424          if (i->sType == TYPE_F32) {
425             if (reg.data.u32 & 0xfff)
426                return false;
427          } else
428          if (i->sType == TYPE_S32 || i->sType == TYPE_U32) {
429             // with u32, 0xfffff counts as 0xffffffff as well
430             if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000)
431                return false;
432          }
433       } else
434       if (i->op == OP_MAD || i->op == OP_FMA) {
435          // requires src == dst, cannot decide before RA
436          // (except if we implement more constraints)
437          if (ld->getSrc(0)->asImm()->reg.data.u32 & 0xfff)
438             return false;
439       }
440    }
441 
442    return true;
443 }
444 
445 bool
isAccessSupported(DataFile file,DataType ty) const446 TargetNVC0::isAccessSupported(DataFile file, DataType ty) const
447 {
448    if (ty == TYPE_NONE)
449       return false;
450    if (file == FILE_MEMORY_CONST && getChipset() >= 0xe0) // wrong encoding ?
451       return typeSizeof(ty) <= 8;
452    if (ty == TYPE_B96)
453       return (file == FILE_SHADER_INPUT) || (file == FILE_SHADER_OUTPUT);
454    return true;
455 }
456 
457 bool
isOpSupported(operation op,DataType ty) const458 TargetNVC0::isOpSupported(operation op, DataType ty) const
459 {
460    if ((op == OP_MAD || op == OP_FMA) && (ty != TYPE_F32))
461       return false;
462    if (op == OP_SAD && ty != TYPE_S32 && ty != TYPE_U32)
463       return false;
464    if (op == OP_POW || op == OP_SQRT || op == OP_DIV || op == OP_MOD)
465       return false;
466    return true;
467 }
468 
469 bool
isModSupported(const Instruction * insn,int s,Modifier mod) const470 TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const
471 {
472    if (!isFloatType(insn->dType)) {
473       switch (insn->op) {
474       case OP_ABS:
475       case OP_NEG:
476       case OP_CVT:
477       case OP_CEIL:
478       case OP_FLOOR:
479       case OP_TRUNC:
480       case OP_AND:
481       case OP_OR:
482       case OP_XOR:
483          break;
484       case OP_ADD:
485          if (mod.abs())
486             return false;
487          if (insn->src(s ? 0 : 1).mod.neg())
488             return false;
489          break;
490       case OP_SUB:
491          if (s == 0)
492             return insn->src(1).mod.neg() ? false : true;
493          break;
494       default:
495          return false;
496       }
497    }
498    if (s > 3)
499       return false;
500    return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
501 }
502 
503 bool
mayPredicate(const Instruction * insn,const Value * pred) const504 TargetNVC0::mayPredicate(const Instruction *insn, const Value *pred) const
505 {
506    if (insn->getPredicate())
507       return false;
508    return opInfo[insn->op].predicate;
509 }
510 
511 bool
isSatSupported(const Instruction * insn) const512 TargetNVC0::isSatSupported(const Instruction *insn) const
513 {
514    if (insn->op == OP_CVT)
515       return true;
516    if (!(opInfo[insn->op].dstMods & NV50_IR_MOD_SAT))
517       return false;
518 
519    if (insn->dType == TYPE_U32)
520       return (insn->op == OP_ADD) || (insn->op == OP_MAD);
521 
522    return insn->dType == TYPE_F32;
523 }
524 
525 bool
isPostMultiplySupported(operation op,float f,int & e) const526 TargetNVC0::isPostMultiplySupported(operation op, float f, int& e) const
527 {
528    if (op != OP_MUL)
529       return false;
530    f = fabsf(f);
531    e = static_cast<int>(log2f(f));
532    if (e < -3 || e > 3)
533       return false;
534    return f == exp2f(static_cast<float>(e));
535 }
536 
537 // TODO: better values
538 // this could be more precise, e.g. depending on the issue-to-read/write delay
539 // of the depending instruction, but it's good enough
getLatency(const Instruction * i) const540 int TargetNVC0::getLatency(const Instruction *i) const
541 {
542    if (chipset >= 0xe4) {
543       if (i->dType == TYPE_F64 || i->sType == TYPE_F64)
544          return 20;
545       switch (i->op) {
546       case OP_LINTERP:
547       case OP_PINTERP:
548          return 15;
549       case OP_LOAD:
550          if (i->src(0).getFile() == FILE_MEMORY_CONST)
551             return 9;
552          // fall through
553       case OP_VFETCH:
554          return 24;
555       default:
556          if (Target::getOpClass(i->op) == OPCLASS_TEXTURE)
557             return 17;
558          if (i->op == OP_MUL && i->dType != TYPE_F32)
559             return 15;
560          return 9;
561       }
562    } else {
563       if (i->op == OP_LOAD) {
564          if (i->cache == CACHE_CV)
565             return 700;
566          return 48;
567       }
568       return 24;
569    }
570    return 32;
571 }
572 
573 // These are "inverse" throughput values, i.e. the number of cycles required
574 // to issue a specific instruction for a full warp (32 threads).
575 //
576 // Assuming we have more than 1 warp in flight, a higher issue latency results
577 // in a lower result latency since the MP will have spent more time with other
578 // warps.
579 // This also helps to determine the number of cycles between instructions in
580 // a single warp.
581 //
getThroughput(const Instruction * i) const582 int TargetNVC0::getThroughput(const Instruction *i) const
583 {
584    // TODO: better values
585    if (i->dType == TYPE_F32) {
586       switch (i->op) {
587       case OP_ADD:
588       case OP_MUL:
589       case OP_MAD:
590       case OP_FMA:
591          return 1;
592       case OP_CVT:
593       case OP_CEIL:
594       case OP_FLOOR:
595       case OP_TRUNC:
596       case OP_SET:
597       case OP_SLCT:
598       case OP_MIN:
599       case OP_MAX:
600          return 2;
601       case OP_RCP:
602       case OP_RSQ:
603       case OP_LG2:
604       case OP_SIN:
605       case OP_COS:
606       case OP_PRESIN:
607       case OP_PREEX2:
608       default:
609          return 8;
610       }
611    } else
612    if (i->dType == TYPE_U32 || i->dType == TYPE_S32) {
613       switch (i->op) {
614       case OP_ADD:
615       case OP_AND:
616       case OP_OR:
617       case OP_XOR:
618       case OP_NOT:
619          return 1;
620       case OP_MUL:
621       case OP_MAD:
622       case OP_CVT:
623       case OP_SET:
624       case OP_SLCT:
625       case OP_SHL:
626       case OP_SHR:
627       case OP_NEG:
628       case OP_ABS:
629       case OP_MIN:
630       case OP_MAX:
631       default:
632          return 2;
633       }
634    } else
635    if (i->dType == TYPE_F64) {
636       return 2;
637    } else {
638       return 1;
639    }
640 }
641 
canDualIssue(const Instruction * a,const Instruction * b) const642 bool TargetNVC0::canDualIssue(const Instruction *a, const Instruction *b) const
643 {
644    const OpClass clA = operationClass[a->op];
645    const OpClass clB = operationClass[b->op];
646 
647    if (getChipset() >= 0xe4) {
648       // not texturing
649       // not if the 2nd instruction isn't necessarily executed
650       if (clA == OPCLASS_TEXTURE || clA == OPCLASS_FLOW)
651          return false;
652       // anything with MOV
653       if (a->op == OP_MOV || b->op == OP_MOV)
654          return true;
655       if (clA == clB) {
656          // only F32 arith or integer additions
657          if (clA != OPCLASS_ARITH)
658             return false;
659          return (a->dType == TYPE_F32 || a->op == OP_ADD ||
660                  b->dType == TYPE_F32 || b->op == OP_ADD);
661       }
662       // nothing with TEXBAR
663       if (a->op == OP_TEXBAR || b->op == OP_TEXBAR)
664          return false;
665       // no loads and stores accessing the the same space
666       if ((clA == OPCLASS_LOAD && clB == OPCLASS_STORE) ||
667           (clB == OPCLASS_LOAD && clA == OPCLASS_STORE))
668          if (a->src(0).getFile() == b->src(0).getFile())
669             return false;
670       // no > 32-bit ops
671       if (typeSizeof(a->dType) > 4 || typeSizeof(b->dType) > 4 ||
672           typeSizeof(a->sType) > 4 || typeSizeof(b->sType) > 4)
673          return false;
674       return true;
675    } else {
676       return false; // info not needed (yet)
677    }
678 }
679 
680 } // namespace nv50_ir
681