1 /*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "nv50_ir_target_nvc0.h"
24
25 namespace nv50_ir {
26
getTargetNVC0(unsigned int chipset)27 Target *getTargetNVC0(unsigned int chipset)
28 {
29 return new TargetNVC0(chipset);
30 }
31
TargetNVC0(unsigned int card)32 TargetNVC0::TargetNVC0(unsigned int card) : Target(false, card >= 0xe4)
33 {
34 chipset = card;
35 initOpInfo();
36 }
37
// BUILTINS / LIBRARY FUNCTIONS:
39
// laziness -> will just hardcode everything for the time being
41
42 // Will probably make this nicer once we support subroutines properly,
43 // i.e. when we have an input IR that provides function declarations.
44
45 // TODO: separate version for nve4+ which doesn't like the 4-byte insn formats
// Pre-assembled code of the builtin library routines, stored as 32-bit words.
// The routines are laid out back to back in the order DIV U32, DIV S32,
// RCP F64, RSQ F64; nvc0_builtin_offsets[] holds the byte offset of each one.
// Lines with two words are 8-byte instructions, lines with a single word are
// 4-byte instructions (two of them make up one 8-byte unit in the SIZE notes).
static const uint32_t nvc0_builtin_code[] =
{
// DIV U32: slow unsigned integer division
//
// UNR recurrence (q = a / b):
// look for z such that 2^32 - b <= b * z < 2^32
// then q - 1 <= (a * z) / 2^32 <= q
//
// INPUT: $r0: dividend, $r1: divisor
// OUTPUT: $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p1
// SIZE: 26 / 14 * 8 bytes (#if 1 variant / #else variant)
//
// NOTE: nvc0_builtin_offsets[] assumes the 26-instruction (#if 1) variant.
//
#if 1
   0x04009c03, 0x78000000,
   0x7c209c82, 0x38000000, // 0x7c209cdd,
   0x0400dde2, 0x18000000, // 0x0010dd18,
   0x08309c03, 0x60000000,
   0x05205d04, 0x1c000000, // 0x05605c18,
   0x0810dc03, 0x50000000, // 0x0810dc2a,
   0x0c209c43, 0x20040000,
   0x0810dc03, 0x50000000,
   0x0c209c43, 0x20040000,
   0x0810dc03, 0x50000000,
   0x0c209c43, 0x20040000,
   0x0810dc03, 0x50000000,
   0x0c209c43, 0x20040000,
   0x0810dc03, 0x50000000,
   0x0c209c43, 0x20040000,
   0x0000dde4, 0x28000000,
   0x08001c43, 0x50000000,
   0x05209d04, 0x1c000000, // 0x05609c18,
   0x00105c03, 0x20060000, // 0x0010430d,
   0x0811dc03, 0x1b0e0000,
   0x08104103, 0x48000000,
   0x04000002, 0x08000000,
   0x0811c003, 0x1b0e0000,
   0x08104103, 0x48000000,
   0x04000002, 0x08000000, // 0x040000ac,
   0x00001de7, 0x90000000, // 0x90001dff,
#else
   0x0401dc03, 0x1b0e0000,
   0x00008003, 0x78000000,
   0x0400c003, 0x78000000,
   0x0c20c103, 0x48000000,
   0x0c108003, 0x60000000,
   0x00005c28,
   0x00001d18,
   0x0031c023, 0x1b0ec000,
   0xb000a1e7, 0x40000000,
   0x04000003, 0x6000c000,
   0x0813dc03, 0x1b000000,
   0x0420446c,
   0x040004bd,
   0x04208003, 0x5800c000,
   0x0430c103, 0x4800c000,
   0x0ffc5dff,
   0x90001dff,
#endif

// DIV S32: slow signed integer division
//
// INPUT: $r0: dividend, $r1: divisor
// OUTPUT: $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p3
// SIZE: 23 * 8 bytes
//
   0xfc05dc23, 0x188e0000,
   0xfc17dc23, 0x18c40000,
   0x01201ec4, 0x1c000000, // 0x03301e18,
   0x05205ec4, 0x1c000000, // 0x07305e18,
   0x0401dc03, 0x1b0e0000,
   0x00008003, 0x78000000,
   0x0400c003, 0x78000000,
   0x0c20c103, 0x48000000,
   0x0c108003, 0x60000000,
   0x00005de4, 0x28000000, // 0x00005c28,
   0x00001de2, 0x18000000, // 0x00001d18,
   0x0031c023, 0x1b0ec000,
   0xe000a1e7, 0x40000000, // 0xb000a1e7, 0x40000000,
   0x04000003, 0x6000c000,
   0x0813dc03, 0x1b000000,
   0x04204603, 0x48000000, // 0x0420446c,
   0x04000442, 0x38000000, // 0x040004bd,
   0x04208003, 0x5800c000,
   0x0430c103, 0x4800c000,
   0xe0001de7, 0x4003fffe, // 0x0ffc5dff,
   0x01200f84, 0x1c000000, // 0x01700e18,
   0x05204b84, 0x1c000000, // 0x05704a18,
   0x00001de7, 0x90000000, // 0x90001dff,

// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
//
// INPUT: $r0d (x)
// OUTPUT: $r0d (rcp(x))
// CLOBBER: $r2 - $r7
// SIZE: 9 * 8 bytes
//
   0x9810dc08,
   0x00009c28,
   0x4001df18,
   0x00019d18,
   0x08011e01, 0x200c0000,
   0x10209c01, 0x50000000,
   0x08011e01, 0x200c0000,
   0x10209c01, 0x50000000,
   0x08011e01, 0x200c0000,
   0x10201c01, 0x50000000,
   0x00001de7, 0x90000000,

// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
//
// INPUT: $r0d (x)
// OUTPUT: $r0d (rsqrt(x))
// CLOBBER: $r2 - $r7
// SIZE: 14 * 8 bytes
//
   0x9c10dc08,
   0x00009c28,
   0x00019d18,
   0x3fe1df18,
   0x18001c01, 0x50000000,
   0x0001dde2, 0x18ffe000,
   0x08211c01, 0x50000000,
   0x10011e01, 0x200c0000,
   0x10209c01, 0x50000000,
   0x08211c01, 0x50000000,
   0x10011e01, 0x200c0000,
   0x10209c01, 0x50000000,
   0x08211c01, 0x50000000,
   0x10011e01, 0x200c0000,
   0x10201c01, 0x50000000,
   0x00001de7, 0x90000000,
};
180
181 static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] =
182 {
183 0,
184 8 * (26),
185 8 * (26 + 23),
186 8 * (26 + 23 + 9)
187 };
188
189 void
getBuiltinCode(const uint32_t ** code,uint32_t * size) const190 TargetNVC0::getBuiltinCode(const uint32_t **code, uint32_t *size) const
191 {
192 *code = &nvc0_builtin_code[0];
193 *size = sizeof(nvc0_builtin_code);
194 }
195
196 uint32_t
getBuiltinOffset(int builtin) const197 TargetNVC0::getBuiltinOffset(int builtin) const
198 {
199 assert(builtin < NVC0_BUILTIN_COUNT);
200 return nvc0_builtin_offsets[builtin];
201 }
202
203 struct opProperties
204 {
205 operation op;
206 unsigned int mNeg : 4;
207 unsigned int mAbs : 4;
208 unsigned int mNot : 4;
209 unsigned int mSat : 4;
210 unsigned int fConst : 3;
211 unsigned int fImmd : 4; // last bit indicates if full immediate is suppoted
212 };
213
// Capability table consumed by TargetNVC0::initOpInfo().
// Columns follow the field order of struct opProperties:
//   neg/abs/not: bit s set -> source s accepts that modifier
//   sat:         0x8 -> the destination accepts saturation
//   c[]:         bit s set -> source s may be a constant buffer operand
//   imm:         bit s set -> source s may be an immediate;
//                0x8 -> immdBits is set to 0xffffffff for the opcode
// Opcodes without a row keep the defaults assigned in initOpInfo().
static const struct opProperties _initProps[] =
{
   //           neg  abs  not  sat  c[]  imm
   { OP_ADD,    0x3, 0x3, 0x0, 0x8, 0x2, 0x2 | 0x8 },
   { OP_SUB,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 | 0x8 },
   { OP_MUL,    0x3, 0x0, 0x0, 0x8, 0x2, 0x2 | 0x8 },
   { OP_MAX,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
   { OP_MIN,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
   { OP_MAD,    0x7, 0x0, 0x0, 0x8, 0x6, 0x2 | 0x8 }, // special c[] constraint
   { OP_ABS,    0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
   { OP_NEG,    0x0, 0x1, 0x0, 0x0, 0x1, 0x0 },
   { OP_CVT,    0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
   { OP_CEIL,   0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
   { OP_FLOOR,  0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
   { OP_TRUNC,  0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
   { OP_AND,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
   { OP_OR,     0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
   { OP_XOR,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
   { OP_SHL,    0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
   { OP_SHR,    0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
   { OP_SET,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
   { OP_SLCT,   0x4, 0x0, 0x0, 0x0, 0x6, 0x2 }, // special c[] constraint
   { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
   { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
   { OP_COS,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
   { OP_SIN,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
   { OP_EX2,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
   { OP_LG2,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
   { OP_RCP,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
   { OP_RSQ,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
   { OP_DFDX,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
   { OP_DFDY,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
   { OP_CALL,   0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
   { OP_INSBF,  0x0, 0x0, 0x0, 0x0, 0x0, 0x4 },
   { OP_SET_AND, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
   { OP_SET_OR,  0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
   { OP_SET_XOR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
   // saturate only:
   { OP_LINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
   { OP_PINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
};
255
initOpInfo()256 void TargetNVC0::initOpInfo()
257 {
258 unsigned int i, j;
259
260 static const uint32_t commutative[(OP_LAST + 31) / 32] =
261 {
262 // ADD, MAD, MUL, AND, OR, XOR, MAX, MIN
263 0x0670ca00, 0x0000003f, 0x00000000
264 };
265
266 static const uint32_t shortForm[(OP_LAST + 31) / 32] =
267 {
268 // ADD, MAD, MUL, AND, OR, XOR, PRESIN, PREEX2, SFN, CVT, PINTERP, MOV
269 0x0670ca00, 0x00000000, 0x00000000
270 };
271
272 static const operation noDest[] =
273 {
274 OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT,
275 OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET,
276 OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART,
277 OP_QUADON, OP_QUADPOP, OP_TEXBAR
278 };
279
280 for (i = 0; i < DATA_FILE_COUNT; ++i)
281 nativeFileMap[i] = (DataFile)i;
282 nativeFileMap[FILE_ADDRESS] = FILE_GPR;
283
284 for (i = 0; i < OP_LAST; ++i) {
285 opInfo[i].variants = NULL;
286 opInfo[i].op = (operation)i;
287 opInfo[i].srcTypes = 1 << (int)TYPE_F32;
288 opInfo[i].dstTypes = 1 << (int)TYPE_F32;
289 opInfo[i].immdBits = 0;
290 opInfo[i].srcNr = operationSrcNr[i];
291
292 for (j = 0; j < opInfo[i].srcNr; ++j) {
293 opInfo[i].srcMods[j] = 0;
294 opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR;
295 }
296 opInfo[i].dstMods = 0;
297 opInfo[i].dstFiles = 1 << (int)FILE_GPR;
298
299 opInfo[i].hasDest = 1;
300 opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
301 opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
302 opInfo[i].pseudo = (i < OP_MOV);
303 opInfo[i].predicate = !opInfo[i].pseudo;
304 opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
305 opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8;
306 }
307 for (i = 0; i < sizeof(noDest) / sizeof(noDest[0]); ++i)
308 opInfo[noDest[i]].hasDest = 0;
309
310 for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) {
311 const struct opProperties *prop = &_initProps[i];
312
313 for (int s = 0; s < 3; ++s) {
314 if (prop->mNeg & (1 << s))
315 opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG;
316 if (prop->mAbs & (1 << s))
317 opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS;
318 if (prop->mNot & (1 << s))
319 opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT;
320 if (prop->fConst & (1 << s))
321 opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST;
322 if (prop->fImmd & (1 << s))
323 opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE;
324 if (prop->fImmd & 8)
325 opInfo[prop->op].immdBits = 0xffffffff;
326 }
327 if (prop->mSat & 8)
328 opInfo[prop->op].dstMods = NV50_IR_MOD_SAT;
329 }
330 }
331
332 unsigned int
getFileSize(DataFile file) const333 TargetNVC0::getFileSize(DataFile file) const
334 {
335 switch (file) {
336 case FILE_NULL: return 0;
337 case FILE_GPR: return 63;
338 case FILE_PREDICATE: return 7;
339 case FILE_FLAGS: return 1;
340 case FILE_ADDRESS: return 0;
341 case FILE_IMMEDIATE: return 0;
342 case FILE_MEMORY_CONST: return 65536;
343 case FILE_SHADER_INPUT: return 0x400;
344 case FILE_SHADER_OUTPUT: return 0x400;
345 case FILE_MEMORY_GLOBAL: return 0xffffffff;
346 case FILE_MEMORY_SHARED: return 16 << 10;
347 case FILE_MEMORY_LOCAL: return 48 << 10;
348 case FILE_SYSTEM_VALUE: return 32;
349 default:
350 assert(!"invalid file");
351 return 0;
352 }
353 }
354
355 unsigned int
getFileUnit(DataFile file) const356 TargetNVC0::getFileUnit(DataFile file) const
357 {
358 if (file == FILE_GPR || file == FILE_ADDRESS || file == FILE_SYSTEM_VALUE)
359 return 2;
360 return 0;
361 }
362
363 uint32_t
getSVAddress(DataFile shaderFile,const Symbol * sym) const364 TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const
365 {
366 const int idx = sym->reg.data.sv.index;
367 const SVSemantic sv = sym->reg.data.sv.sv;
368
369 const bool isInput = shaderFile == FILE_SHADER_INPUT;
370
371 switch (sv) {
372 case SV_POSITION: return 0x070 + idx * 4;
373 case SV_INSTANCE_ID: return 0x2f8;
374 case SV_VERTEX_ID: return 0x2fc;
375 case SV_PRIMITIVE_ID: return isInput ? 0x060 : 0x040;
376 case SV_LAYER: return 0x064;
377 case SV_VIEWPORT_INDEX: return 0x068;
378 case SV_POINT_SIZE: return 0x06c;
379 case SV_CLIP_DISTANCE: return 0x2c0 + idx * 4;
380 case SV_POINT_COORD: return 0x2e0 + idx * 4;
381 case SV_FACE: return 0x3fc;
382 case SV_TESS_FACTOR: return 0x000 + idx * 4;
383 case SV_TESS_COORD: return 0x2f0 + idx * 4;
384 default:
385 return 0xffffffff;
386 }
387 }
388
389 bool
insnCanLoad(const Instruction * i,int s,const Instruction * ld) const390 TargetNVC0::insnCanLoad(const Instruction *i, int s,
391 const Instruction *ld) const
392 {
393 DataFile sf = ld->src(0).getFile();
394
395 // immediate 0 can be represented by GPR $r63
396 if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0)
397 return (!i->asTex() && i->op != OP_EXPORT && i->op != OP_STORE);
398
399 if (s >= opInfo[i->op].srcNr)
400 return false;
401 if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf)))
402 return false;
403
404 // indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0
405 if (ld->src(0).isIndirect(0))
406 return false;
407
408 for (int k = 0; i->srcExists(k); ++k) {
409 if (i->src(k).getFile() == FILE_IMMEDIATE) {
410 if (i->getSrc(k)->reg.data.u64 != 0)
411 return false;
412 } else
413 if (i->src(k).getFile() != FILE_GPR &&
414 i->src(k).getFile() != FILE_PREDICATE) {
415 return false;
416 }
417 }
418
419 // not all instructions support full 32 bit immediates
420 if (sf == FILE_IMMEDIATE) {
421 Storage ® = ld->getSrc(0)->asImm()->reg;
422
423 if (opInfo[i->op].immdBits != 0xffffffff) {
424 if (i->sType == TYPE_F32) {
425 if (reg.data.u32 & 0xfff)
426 return false;
427 } else
428 if (i->sType == TYPE_S32 || i->sType == TYPE_U32) {
429 // with u32, 0xfffff counts as 0xffffffff as well
430 if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000)
431 return false;
432 }
433 } else
434 if (i->op == OP_MAD || i->op == OP_FMA) {
435 // requires src == dst, cannot decide before RA
436 // (except if we implement more constraints)
437 if (ld->getSrc(0)->asImm()->reg.data.u32 & 0xfff)
438 return false;
439 }
440 }
441
442 return true;
443 }
444
445 bool
isAccessSupported(DataFile file,DataType ty) const446 TargetNVC0::isAccessSupported(DataFile file, DataType ty) const
447 {
448 if (ty == TYPE_NONE)
449 return false;
450 if (file == FILE_MEMORY_CONST && getChipset() >= 0xe0) // wrong encoding ?
451 return typeSizeof(ty) <= 8;
452 if (ty == TYPE_B96)
453 return (file == FILE_SHADER_INPUT) || (file == FILE_SHADER_OUTPUT);
454 return true;
455 }
456
457 bool
isOpSupported(operation op,DataType ty) const458 TargetNVC0::isOpSupported(operation op, DataType ty) const
459 {
460 if ((op == OP_MAD || op == OP_FMA) && (ty != TYPE_F32))
461 return false;
462 if (op == OP_SAD && ty != TYPE_S32 && ty != TYPE_U32)
463 return false;
464 if (op == OP_POW || op == OP_SQRT || op == OP_DIV || op == OP_MOD)
465 return false;
466 return true;
467 }
468
469 bool
isModSupported(const Instruction * insn,int s,Modifier mod) const470 TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const
471 {
472 if (!isFloatType(insn->dType)) {
473 switch (insn->op) {
474 case OP_ABS:
475 case OP_NEG:
476 case OP_CVT:
477 case OP_CEIL:
478 case OP_FLOOR:
479 case OP_TRUNC:
480 case OP_AND:
481 case OP_OR:
482 case OP_XOR:
483 break;
484 case OP_ADD:
485 if (mod.abs())
486 return false;
487 if (insn->src(s ? 0 : 1).mod.neg())
488 return false;
489 break;
490 case OP_SUB:
491 if (s == 0)
492 return insn->src(1).mod.neg() ? false : true;
493 break;
494 default:
495 return false;
496 }
497 }
498 if (s > 3)
499 return false;
500 return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
501 }
502
503 bool
mayPredicate(const Instruction * insn,const Value * pred) const504 TargetNVC0::mayPredicate(const Instruction *insn, const Value *pred) const
505 {
506 if (insn->getPredicate())
507 return false;
508 return opInfo[insn->op].predicate;
509 }
510
511 bool
isSatSupported(const Instruction * insn) const512 TargetNVC0::isSatSupported(const Instruction *insn) const
513 {
514 if (insn->op == OP_CVT)
515 return true;
516 if (!(opInfo[insn->op].dstMods & NV50_IR_MOD_SAT))
517 return false;
518
519 if (insn->dType == TYPE_U32)
520 return (insn->op == OP_ADD) || (insn->op == OP_MAD);
521
522 return insn->dType == TYPE_F32;
523 }
524
525 bool
isPostMultiplySupported(operation op,float f,int & e) const526 TargetNVC0::isPostMultiplySupported(operation op, float f, int& e) const
527 {
528 if (op != OP_MUL)
529 return false;
530 f = fabsf(f);
531 e = static_cast<int>(log2f(f));
532 if (e < -3 || e > 3)
533 return false;
534 return f == exp2f(static_cast<float>(e));
535 }
536
537 // TODO: better values
538 // this could be more precise, e.g. depending on the issue-to-read/write delay
539 // of the depending instruction, but it's good enough
getLatency(const Instruction * i) const540 int TargetNVC0::getLatency(const Instruction *i) const
541 {
542 if (chipset >= 0xe4) {
543 if (i->dType == TYPE_F64 || i->sType == TYPE_F64)
544 return 20;
545 switch (i->op) {
546 case OP_LINTERP:
547 case OP_PINTERP:
548 return 15;
549 case OP_LOAD:
550 if (i->src(0).getFile() == FILE_MEMORY_CONST)
551 return 9;
552 // fall through
553 case OP_VFETCH:
554 return 24;
555 default:
556 if (Target::getOpClass(i->op) == OPCLASS_TEXTURE)
557 return 17;
558 if (i->op == OP_MUL && i->dType != TYPE_F32)
559 return 15;
560 return 9;
561 }
562 } else {
563 if (i->op == OP_LOAD) {
564 if (i->cache == CACHE_CV)
565 return 700;
566 return 48;
567 }
568 return 24;
569 }
570 return 32;
571 }
572
573 // These are "inverse" throughput values, i.e. the number of cycles required
574 // to issue a specific instruction for a full warp (32 threads).
575 //
576 // Assuming we have more than 1 warp in flight, a higher issue latency results
577 // in a lower result latency since the MP will have spent more time with other
578 // warps.
579 // This also helps to determine the number of cycles between instructions in
580 // a single warp.
581 //
getThroughput(const Instruction * i) const582 int TargetNVC0::getThroughput(const Instruction *i) const
583 {
584 // TODO: better values
585 if (i->dType == TYPE_F32) {
586 switch (i->op) {
587 case OP_ADD:
588 case OP_MUL:
589 case OP_MAD:
590 case OP_FMA:
591 return 1;
592 case OP_CVT:
593 case OP_CEIL:
594 case OP_FLOOR:
595 case OP_TRUNC:
596 case OP_SET:
597 case OP_SLCT:
598 case OP_MIN:
599 case OP_MAX:
600 return 2;
601 case OP_RCP:
602 case OP_RSQ:
603 case OP_LG2:
604 case OP_SIN:
605 case OP_COS:
606 case OP_PRESIN:
607 case OP_PREEX2:
608 default:
609 return 8;
610 }
611 } else
612 if (i->dType == TYPE_U32 || i->dType == TYPE_S32) {
613 switch (i->op) {
614 case OP_ADD:
615 case OP_AND:
616 case OP_OR:
617 case OP_XOR:
618 case OP_NOT:
619 return 1;
620 case OP_MUL:
621 case OP_MAD:
622 case OP_CVT:
623 case OP_SET:
624 case OP_SLCT:
625 case OP_SHL:
626 case OP_SHR:
627 case OP_NEG:
628 case OP_ABS:
629 case OP_MIN:
630 case OP_MAX:
631 default:
632 return 2;
633 }
634 } else
635 if (i->dType == TYPE_F64) {
636 return 2;
637 } else {
638 return 1;
639 }
640 }
641
canDualIssue(const Instruction * a,const Instruction * b) const642 bool TargetNVC0::canDualIssue(const Instruction *a, const Instruction *b) const
643 {
644 const OpClass clA = operationClass[a->op];
645 const OpClass clB = operationClass[b->op];
646
647 if (getChipset() >= 0xe4) {
648 // not texturing
649 // not if the 2nd instruction isn't necessarily executed
650 if (clA == OPCLASS_TEXTURE || clA == OPCLASS_FLOW)
651 return false;
652 // anything with MOV
653 if (a->op == OP_MOV || b->op == OP_MOV)
654 return true;
655 if (clA == clB) {
656 // only F32 arith or integer additions
657 if (clA != OPCLASS_ARITH)
658 return false;
659 return (a->dType == TYPE_F32 || a->op == OP_ADD ||
660 b->dType == TYPE_F32 || b->op == OP_ADD);
661 }
662 // nothing with TEXBAR
663 if (a->op == OP_TEXBAR || b->op == OP_TEXBAR)
664 return false;
665 // no loads and stores accessing the the same space
666 if ((clA == OPCLASS_LOAD && clB == OPCLASS_STORE) ||
667 (clB == OPCLASS_LOAD && clA == OPCLASS_STORE))
668 if (a->src(0).getFile() == b->src(0).getFile())
669 return false;
670 // no > 32-bit ops
671 if (typeSizeof(a->dType) > 4 || typeSizeof(b->dType) > 4 ||
672 typeSizeof(a->sType) > 4 || typeSizeof(b->sType) > 4)
673 return false;
674 return true;
675 } else {
676 return false; // info not needed (yet)
677 }
678 }
679
680 } // namespace nv50_ir
681