1 /*
2  * Copyright 2011 Christoph Bumiller
3  *           2014 Red Hat Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21  * OTHER DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "codegen/nv50_ir.h"
25 #include "codegen/nv50_ir_build_util.h"
26 
27 #include "codegen/nv50_ir_target_nvc0.h"
28 #include "codegen/nv50_ir_lowering_gm107.h"
29 
30 #include <limits>
31 
32 namespace nv50_ir {
33 
// Two-bit per-lane operation selectors that QUADOP() packs into a subOp byte.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs four QOP_* selectors (given without the QOP_ prefix) into one 8-bit
// value: q occupies bits 7:6, r bits 5:4, s bits 3:2 and t bits 1:0.
// Argument order, as labelled by the original author:
//             UL UR LL LR
#define QUADOP(q, r, s, t)   \
   ((QOP_##q << 6) |         \
    (QOP_##r << 4) |         \
    (QOP_##s << 2) |         \
    (QOP_##t << 0))

// Immediate passed as the last source of every OP_SHFL built in this file.
// NOTE(review): presumably encodes a quad-wide shuffle bound -- confirm
// against the SHFL operand encoding.
#define SHFL_BOUND_QUAD 0x1c03
45 
46 void
handlePFETCH(Instruction * i)47 GM107LegalizeSSA::handlePFETCH(Instruction *i)
48 {
49    Value *src0;
50 
51    if (i->src(0).getFile() == FILE_GPR && !i->srcExists(1))
52       return;
53 
54    bld.setPosition(i, false);
55    src0 = bld.getSSA();
56 
57    if (i->srcExists(1))
58       bld.mkOp2(OP_ADD , TYPE_U32, src0, i->getSrc(0), i->getSrc(1));
59    else
60       bld.mkOp1(OP_MOV , TYPE_U32, src0, i->getSrc(0));
61 
62    i->setSrc(0, src0);
63    i->setSrc(1, NULL);
64 }
65 
66 void
handleLOAD(Instruction * i)67 GM107LegalizeSSA::handleLOAD(Instruction *i)
68 {
69    if (i->src(0).getFile() != FILE_MEMORY_CONST)
70       return;
71    if (i->src(0).isIndirect(0))
72       return;
73    if (typeSizeof(i->dType) != 4)
74       return;
75 
76    i->op = OP_MOV;
77 }
78 
79 bool
visit(Instruction * i)80 GM107LegalizeSSA::visit(Instruction *i)
81 {
82    switch (i->op) {
83    case OP_PFETCH:
84       handlePFETCH(i);
85       break;
86    case OP_LOAD:
87       handleLOAD(i);
88       break;
89    default:
90       break;
91    }
92    return true;
93 }
94 
95 bool
handleManualTXD(TexInstruction * i)96 GM107LoweringPass::handleManualTXD(TexInstruction *i)
97 {
98    // See NVC0LoweringPass::handleManualTXD for rationale. This function
99    // implements the same logic, but using SM50-friendly primitives.
100    static const uint8_t qOps[2] =
101       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) };
102    Value *def[4][4];
103    Value *crd[3], *arr, *shadow;
104    Value *tmp;
105    Instruction *tex, *add;
106    Value *quad = bld.mkImm(SHFL_BOUND_QUAD);
107    int l, c;
108    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
109    const int array = i->tex.target.isArray();
110    const int indirect = i->tex.rIndirectSrc >= 0;
111 
112    i->op = OP_TEX; // no need to clone dPdx/dPdy later
113 
114    for (c = 0; c < dim; ++c)
115       crd[c] = bld.getScratch();
116    arr = bld.getScratch();
117    shadow = bld.getScratch();
118    tmp = bld.getScratch();
119 
120    for (l = 0; l < 4; ++l) {
121       Value *src[3], *val;
122       Value *lane = bld.mkImm(l);
123       bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
124       // Make sure lane 0 has the appropriate array/depth compare values
125       if (l != 0) {
126          if (array)
127             bld.mkOp3(OP_SHFL, TYPE_F32, arr, i->getSrc(0), lane, quad);
128          if (i->tex.target.isShadow())
129             bld.mkOp3(OP_SHFL, TYPE_F32, shadow, i->getSrc(array + dim + indirect), lane, quad);
130       }
131 
132       // mov coordinates from lane l to all lanes
133       for (c = 0; c < dim; ++c) {
134          bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), lane, quad);
135       }
136 
137       // add dPdx from lane l to lanes dx
138       for (c = 0; c < dim; ++c) {
139          bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), lane, quad);
140          add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
141          add->subOp = qOps[0];
142          add->lanes = 1; /* abused for .ndv */
143       }
144 
145       // add dPdy from lane l to lanes dy
146       for (c = 0; c < dim; ++c) {
147          bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), lane, quad);
148          add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
149          add->subOp = qOps[1];
150          add->lanes = 1; /* abused for .ndv */
151       }
152 
153       // normalize cube coordinates if necessary
154       if (i->tex.target.isCube()) {
155          for (c = 0; c < 3; ++c)
156             src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
157          val = bld.getScratch();
158          bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
159          bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
160          bld.mkOp1(OP_RCP, TYPE_F32, val, val);
161          for (c = 0; c < 3; ++c)
162             src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
163       } else {
164          for (c = 0; c < dim; ++c)
165             src[c] = crd[c];
166       }
167 
168       // texture
169       bld.insert(tex = cloneForward(func, i));
170       if (l != 0) {
171          if (array)
172             tex->setSrc(0, arr);
173          if (i->tex.target.isShadow())
174             tex->setSrc(array + dim + indirect, shadow);
175       }
176       for (c = 0; c < dim; ++c)
177          tex->setSrc(c + array, src[c]);
178       // broadcast results from lane 0 to all lanes
179       if (l != 0)
180          for (c = 0; i->defExists(c); ++c)
181             bld.mkOp3(OP_SHFL, TYPE_F32, tex->getDef(c), tex->getDef(c), bld.mkImm(0), quad);
182       bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
183 
184       // save results
185       for (c = 0; i->defExists(c); ++c) {
186          Instruction *mov;
187          def[c][l] = bld.getSSA();
188          mov = bld.mkMov(def[c][l], tex->getDef(c));
189          mov->fixed = 1;
190          mov->lanes = 1 << l;
191       }
192    }
193 
194    for (c = 0; i->defExists(c); ++c) {
195       Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
196       for (l = 0; l < 4; ++l)
197          u->setSrc(l, def[c][l]);
198    }
199 
200    i->bb->remove(i);
201    return true;
202 }
203 
204 bool
handleDFDX(Instruction * insn)205 GM107LoweringPass::handleDFDX(Instruction *insn)
206 {
207    Instruction *shfl;
208    int qop = 0, xid = 0;
209 
210    switch (insn->op) {
211    case OP_DFDX:
212       qop = QUADOP(SUB, SUBR, SUB, SUBR);
213       xid = 1;
214       break;
215    case OP_DFDY:
216       qop = QUADOP(SUB, SUB, SUBR, SUBR);
217       xid = 2;
218       break;
219    default:
220       assert(!"invalid dfdx opcode");
221       break;
222    }
223 
224    shfl = bld.mkOp3(OP_SHFL, TYPE_F32, bld.getScratch(), insn->getSrc(0),
225                     bld.mkImm(xid), bld.mkImm(SHFL_BOUND_QUAD));
226    shfl->subOp = NV50_IR_SUBOP_SHFL_BFLY;
227    insn->op = OP_QUADOP;
228    insn->subOp = qop;
229    insn->lanes = 0; /* abused for !.ndv */
230    insn->setSrc(1, insn->getSrc(0));
231    insn->setSrc(0, shfl->getDef(0));
232    return true;
233 }
234 
235 bool
handlePFETCH(Instruction * i)236 GM107LoweringPass::handlePFETCH(Instruction *i)
237 {
238    Value *tmp0 = bld.getScratch();
239    Value *tmp1 = bld.getScratch();
240    Value *tmp2 = bld.getScratch();
241    bld.mkOp1(OP_RDSV, TYPE_U32, tmp0, bld.mkSysVal(SV_INVOCATION_INFO, 0));
242    bld.mkOp2(OP_SHR , TYPE_U32, tmp1, tmp0, bld.mkImm(16));
243    bld.mkOp2(OP_AND , TYPE_U32, tmp0, tmp0, bld.mkImm(0xff));
244    bld.mkOp2(OP_AND , TYPE_U32, tmp1, tmp1, bld.mkImm(0xff));
245    if (i->getSrc(1))
246       bld.mkOp2(OP_ADD , TYPE_U32, tmp2, i->getSrc(0), i->getSrc(1));
247    else
248       bld.mkOp1(OP_MOV , TYPE_U32, tmp2, i->getSrc(0));
249    bld.mkOp3(OP_MAD , TYPE_U32, tmp0, tmp0, tmp1, tmp2);
250    i->setSrc(0, tmp0);
251    i->setSrc(1, NULL);
252    return true;
253 }
254 
255 bool
handlePOPCNT(Instruction * i)256 GM107LoweringPass::handlePOPCNT(Instruction *i)
257 {
258    Value *tmp = bld.mkOp2v(OP_AND, i->sType, bld.getScratch(),
259                            i->getSrc(0), i->getSrc(1));
260    i->setSrc(0, tmp);
261    i->setSrc(1, NULL);
262    return true;
263 }
264 
265 //
266 // - add quadop dance for texturing
267 // - put FP outputs in GPRs
268 // - convert instruction sequences
269 //
270 bool
visit(Instruction * i)271 GM107LoweringPass::visit(Instruction *i)
272 {
273    bld.setPosition(i, false);
274 
275    if (i->cc != CC_ALWAYS)
276       checkPredicate(i);
277 
278    switch (i->op) {
279    case OP_PFETCH:
280       return handlePFETCH(i);
281    case OP_DFDX:
282    case OP_DFDY:
283       return handleDFDX(i);
284    case OP_POPCNT:
285       return handlePOPCNT(i);
286    default:
287       return NVC0LoweringPass::visit(i);
288    }
289 }
290 
291 } // namespace nv50_ir
292