1 /*
2 * Copyright (C) 2008 Nicolai Haehnle.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 /**
29 * @file
30 *
31 * Shareable transformations that transform "special" ALU instructions
32 * into ALU instructions that are supported by hardware.
33 *
34 */
35
36 #include "radeon_program_alu.h"
37
38 #include "radeon_compiler.h"
39 #include "radeon_compiler_util.h"
40
41
/**
 * Create an instruction via rc_insert_new_instruction(c, after) and fill it
 * in as a one-source operation.
 *
 * When @p base is non-NULL the complete sub-instruction is copied from it
 * first; opcode, destination and source 0 are then overwritten with the
 * values passed in.
 *
 * @return the newly created instruction.
 */
static struct rc_instruction *emit1(
	struct radeon_compiler *c, struct rc_instruction *after,
	rc_opcode Opcode, struct rc_sub_instruction *base,
	struct rc_dst_register DstReg, struct rc_src_register SrcReg)
{
	struct rc_instruction *new_inst = rc_insert_new_instruction(c, after);
	struct rc_sub_instruction *sub = &new_inst->U.I;

	if (base)
		memcpy(sub, base, sizeof(struct rc_sub_instruction));

	sub->SrcReg[0] = SrcReg;
	sub->DstReg = DstReg;
	sub->Opcode = Opcode;

	return new_inst;
}
58
/**
 * Create an instruction via rc_insert_new_instruction(c, after) and fill it
 * in as a two-source operation.
 *
 * When @p base is non-NULL the complete sub-instruction is copied from it
 * first; opcode, destination and sources 0 and 1 are then overwritten with
 * the values passed in.
 *
 * @return the newly created instruction.
 */
static struct rc_instruction *emit2(
	struct radeon_compiler *c, struct rc_instruction *after,
	rc_opcode Opcode, struct rc_sub_instruction *base,
	struct rc_dst_register DstReg,
	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
{
	struct rc_instruction *new_inst = rc_insert_new_instruction(c, after);
	struct rc_sub_instruction *sub = &new_inst->U.I;

	if (base)
		memcpy(sub, base, sizeof(struct rc_sub_instruction));

	sub->SrcReg[1] = SrcReg1;
	sub->SrcReg[0] = SrcReg0;
	sub->DstReg = DstReg;
	sub->Opcode = Opcode;

	return new_inst;
}
77
/**
 * Create an instruction via rc_insert_new_instruction(c, after) and fill it
 * in as a three-source operation.
 *
 * When @p base is non-NULL the complete sub-instruction is copied from it
 * first; opcode, destination and sources 0..2 are then overwritten with the
 * values passed in.
 *
 * @return the newly created instruction.
 */
static struct rc_instruction *emit3(
	struct radeon_compiler *c, struct rc_instruction *after,
	rc_opcode Opcode, struct rc_sub_instruction *base,
	struct rc_dst_register DstReg,
	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
	struct rc_src_register SrcReg2)
{
	struct rc_instruction *new_inst = rc_insert_new_instruction(c, after);
	struct rc_sub_instruction *sub = &new_inst->U.I;

	if (base)
		memcpy(sub, base, sizeof(struct rc_sub_instruction));

	sub->SrcReg[2] = SrcReg2;
	sub->SrcReg[1] = SrcReg1;
	sub->SrcReg[0] = SrcReg0;
	sub->DstReg = DstReg;
	sub->Opcode = Opcode;

	return new_inst;
}
98
dstregtmpmask(int index,int mask)99 static struct rc_dst_register dstregtmpmask(int index, int mask)
100 {
101 struct rc_dst_register dst = {0, 0, 0};
102 dst.File = RC_FILE_TEMPORARY;
103 dst.Index = index;
104 dst.WriteMask = mask;
105 return dst;
106 }
107
108 static const struct rc_src_register builtin_zero = {
109 .File = RC_FILE_NONE,
110 .Index = 0,
111 .Swizzle = RC_SWIZZLE_0000
112 };
113 static const struct rc_src_register builtin_one = {
114 .File = RC_FILE_NONE,
115 .Index = 0,
116 .Swizzle = RC_SWIZZLE_1111
117 };
118
119 static const struct rc_src_register builtin_half = {
120 .File = RC_FILE_NONE,
121 .Index = 0,
122 .Swizzle = RC_SWIZZLE_HHHH
123 };
124
125 static const struct rc_src_register srcreg_undefined = {
126 .File = RC_FILE_NONE,
127 .Index = 0,
128 .Swizzle = RC_SWIZZLE_XYZW
129 };
130
srcreg(int file,int index)131 static struct rc_src_register srcreg(int file, int index)
132 {
133 struct rc_src_register src = srcreg_undefined;
134 src.File = file;
135 src.Index = index;
136 return src;
137 }
138
srcregswz(int file,int index,int swz)139 static struct rc_src_register srcregswz(int file, int index, int swz)
140 {
141 struct rc_src_register src = srcreg_undefined;
142 src.File = file;
143 src.Index = index;
144 src.Swizzle = swz;
145 return src;
146 }
147
absolute(struct rc_src_register reg)148 static struct rc_src_register absolute(struct rc_src_register reg)
149 {
150 struct rc_src_register newreg = reg;
151 newreg.Abs = 1;
152 newreg.Negate = RC_MASK_NONE;
153 return newreg;
154 }
155
negate(struct rc_src_register reg)156 static struct rc_src_register negate(struct rc_src_register reg)
157 {
158 struct rc_src_register newreg = reg;
159 newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
160 return newreg;
161 }
162
/**
 * Return a copy of @p reg whose swizzle is the existing swizzle combined
 * with (x, y, z, w) by combine_swizzles4().
 */
static struct rc_src_register swizzle(struct rc_src_register reg,
	rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
{
	/* reg is passed by value, so it can be edited in place. */
	reg.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);

	return reg;
}
170
/**
 * Apply swizzle() with the same selector @p x in all four channels.
 */
static struct rc_src_register swizzle_smear(struct rc_src_register reg,
	rc_swizzle x)
{
	struct rc_src_register smeared = swizzle(reg, x, x, x, x);

	return smeared;
}
176
swizzle_xxxx(struct rc_src_register reg)177 static struct rc_src_register swizzle_xxxx(struct rc_src_register reg)
178 {
179 return swizzle_smear(reg, RC_SWIZZLE_X);
180 }
181
swizzle_yyyy(struct rc_src_register reg)182 static struct rc_src_register swizzle_yyyy(struct rc_src_register reg)
183 {
184 return swizzle_smear(reg, RC_SWIZZLE_Y);
185 }
186
swizzle_zzzz(struct rc_src_register reg)187 static struct rc_src_register swizzle_zzzz(struct rc_src_register reg)
188 {
189 return swizzle_smear(reg, RC_SWIZZLE_Z);
190 }
191
swizzle_wwww(struct rc_src_register reg)192 static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
193 {
194 return swizzle_smear(reg, RC_SWIZZLE_W);
195 }
196
is_dst_safe_to_reuse(struct rc_instruction * inst)197 static int is_dst_safe_to_reuse(struct rc_instruction *inst)
198 {
199 const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
200 unsigned i;
201
202 assert(info->HasDstReg);
203
204 if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
205 return 0;
206
207 for (i = 0; i < info->NumSrcRegs; i++) {
208 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
209 inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
210 return 0;
211 }
212
213 return 1;
214 }
215
try_to_reuse_dst(struct radeon_compiler * c,struct rc_instruction * inst)216 static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
217 struct rc_instruction *inst)
218 {
219 unsigned tmp;
220
221 if (is_dst_safe_to_reuse(inst))
222 tmp = inst->U.I.DstReg.Index;
223 else
224 tmp = rc_find_free_temporary(c);
225
226 return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
227 }
228
transform_ABS(struct radeon_compiler * c,struct rc_instruction * inst)229 static void transform_ABS(struct radeon_compiler* c,
230 struct rc_instruction* inst)
231 {
232 struct rc_src_register src = inst->U.I.SrcReg[0];
233 src.Abs = 1;
234 src.Negate = RC_MASK_NONE;
235 emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src);
236 rc_remove_instruction(inst);
237 }
238
transform_CEIL(struct radeon_compiler * c,struct rc_instruction * inst)239 static void transform_CEIL(struct radeon_compiler* c,
240 struct rc_instruction* inst)
241 {
242 /* Assuming:
243 * ceil(x) = -floor(-x)
244 *
245 * After inlining floor:
246 * ceil(x) = -(-x-frac(-x))
247 *
248 * After simplification:
249 * ceil(x) = x+frac(-x)
250 */
251
252 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
253 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
254 emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
255 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
256 rc_remove_instruction(inst);
257 }
258
transform_CLAMP(struct radeon_compiler * c,struct rc_instruction * inst)259 static void transform_CLAMP(struct radeon_compiler *c,
260 struct rc_instruction *inst)
261 {
262 /* CLAMP dst, src, min, max
263 * into:
264 * MIN tmp, src, max
265 * MAX dst, tmp, min
266 */
267 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
268 emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
269 inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
270 emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg,
271 srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
272 rc_remove_instruction(inst);
273 }
274
transform_DP2(struct radeon_compiler * c,struct rc_instruction * inst)275 static void transform_DP2(struct radeon_compiler* c,
276 struct rc_instruction* inst)
277 {
278 struct rc_src_register src0 = inst->U.I.SrcReg[0];
279 struct rc_src_register src1 = inst->U.I.SrcReg[1];
280 src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
281 src0.Swizzle &= ~(63 << (3 * 2));
282 src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
283 src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
284 src1.Swizzle &= ~(63 << (3 * 2));
285 src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
286 emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);
287 rc_remove_instruction(inst);
288 }
289
transform_DPH(struct radeon_compiler * c,struct rc_instruction * inst)290 static void transform_DPH(struct radeon_compiler* c,
291 struct rc_instruction* inst)
292 {
293 struct rc_src_register src0 = inst->U.I.SrcReg[0];
294 src0.Negate &= ~RC_MASK_W;
295 src0.Swizzle &= ~(7 << (3 * 3));
296 src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
297 emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
298 rc_remove_instruction(inst);
299 }
300
301 /**
302 * [1, src0.y*src1.y, src0.z, src1.w]
303 * So basically MUL with lotsa swizzling.
304 */
transform_DST(struct radeon_compiler * c,struct rc_instruction * inst)305 static void transform_DST(struct radeon_compiler* c,
306 struct rc_instruction* inst)
307 {
308 emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
309 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
310 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
311 rc_remove_instruction(inst);
312 }
313
transform_FLR(struct radeon_compiler * c,struct rc_instruction * inst)314 static void transform_FLR(struct radeon_compiler* c,
315 struct rc_instruction* inst)
316 {
317 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
318 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
319 emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
320 inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
321 rc_remove_instruction(inst);
322 }
323
transform_TRUNC(struct radeon_compiler * c,struct rc_instruction * inst)324 static void transform_TRUNC(struct radeon_compiler* c,
325 struct rc_instruction* inst)
326 {
327 /* Definition of trunc:
328 * trunc(x) = (abs(x) - fract(abs(x))) * sgn(x)
329 *
330 * The multiplication by sgn(x) can be simplified using CMP:
331 * y * sgn(x) = (x < 0 ? -y : y)
332 */
333 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
334 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, absolute(inst->U.I.SrcReg[0]));
335 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, absolute(inst->U.I.SrcReg[0]),
336 negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
337 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, inst->U.I.SrcReg[0],
338 negate(srcreg(RC_FILE_TEMPORARY, dst.Index)), srcreg(RC_FILE_TEMPORARY, dst.Index));
339 rc_remove_instruction(inst);
340 }
341
342 /**
343 * Definition of LIT (from ARB_fragment_program):
344 *
345 * tmp = VectorLoad(op0);
346 * if (tmp.x < 0) tmp.x = 0;
347 * if (tmp.y < 0) tmp.y = 0;
348 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
349 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
350 * result.x = 1.0;
351 * result.y = tmp.x;
352 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
353 * result.w = 1.0;
354 *
355 * The longest path of computation is the one leading to result.z,
356 * consisting of 5 operations. This implementation of LIT takes
357 * 5 slots, if the subsequent optimization passes are clever enough
358 * to pair instructions correctly.
359 */
transform_LIT(struct radeon_compiler * c,struct rc_instruction * inst)360 static void transform_LIT(struct radeon_compiler* c,
361 struct rc_instruction* inst)
362 {
363 unsigned int constant;
364 unsigned int constant_swizzle;
365 unsigned int temp;
366 struct rc_src_register srctemp;
367
368 constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
369
370 if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
371 struct rc_instruction * inst_mov;
372
373 inst_mov = emit1(c, inst,
374 RC_OPCODE_MOV, 0, inst->U.I.DstReg,
375 srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
376
377 inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
378 inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
379 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
380 }
381
382 temp = inst->U.I.DstReg.Index;
383 srctemp = srcreg(RC_FILE_TEMPORARY, temp);
384
385 /* tmp.x = max(0.0, Src.x); */
386 /* tmp.y = max(0.0, Src.y); */
387 /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
388 emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
389 dstregtmpmask(temp, RC_MASK_XYW),
390 inst->U.I.SrcReg[0],
391 swizzle(srcreg(RC_FILE_CONSTANT, constant),
392 RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
393 emit2(c, inst->Prev, RC_OPCODE_MIN, 0,
394 dstregtmpmask(temp, RC_MASK_Z),
395 swizzle_wwww(srctemp),
396 negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
397
398 /* tmp.w = Pow(tmp.y, tmp.w) */
399 emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
400 dstregtmpmask(temp, RC_MASK_W),
401 swizzle_yyyy(srctemp));
402 emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
403 dstregtmpmask(temp, RC_MASK_W),
404 swizzle_wwww(srctemp),
405 swizzle_zzzz(srctemp));
406 emit1(c, inst->Prev, RC_OPCODE_EX2, 0,
407 dstregtmpmask(temp, RC_MASK_W),
408 swizzle_wwww(srctemp));
409
410 /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
411 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
412 dstregtmpmask(temp, RC_MASK_Z),
413 negate(swizzle_xxxx(srctemp)),
414 swizzle_wwww(srctemp),
415 builtin_zero);
416
417 /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
418 emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
419 dstregtmpmask(temp, RC_MASK_XYW),
420 swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
421
422 rc_remove_instruction(inst);
423 }
424
transform_LRP(struct radeon_compiler * c,struct rc_instruction * inst)425 static void transform_LRP(struct radeon_compiler* c,
426 struct rc_instruction* inst)
427 {
428 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
429
430 emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
431 dst,
432 inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
433 emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I,
434 inst->U.I.DstReg,
435 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
436
437 rc_remove_instruction(inst);
438 }
439
transform_POW(struct radeon_compiler * c,struct rc_instruction * inst)440 static void transform_POW(struct radeon_compiler* c,
441 struct rc_instruction* inst)
442 {
443 struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
444 struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
445 tempdst.WriteMask = RC_MASK_W;
446 tempsrc.Swizzle = RC_SWIZZLE_WWWW;
447
448 emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
449 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
450 emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
451
452 rc_remove_instruction(inst);
453 }
454
455 /* dst = ROUND(src) :
456 * add = src + .5
457 * frac = FRC(add)
458 * dst = add - frac
459 *
460 * According to the GLSL spec, the implementor can decide which way to round
461 * when the fraction is .5. We round down for .5.
462 *
463 */
transform_ROUND(struct radeon_compiler * c,struct rc_instruction * inst)464 static void transform_ROUND(struct radeon_compiler* c,
465 struct rc_instruction* inst)
466 {
467 unsigned int mask = inst->U.I.DstReg.WriteMask;
468 unsigned int frac_index, add_index;
469 struct rc_dst_register frac_dst, add_dst;
470 struct rc_src_register frac_src, add_src;
471
472 /* add = src + .5 */
473 add_index = rc_find_free_temporary(c);
474 add_dst = dstregtmpmask(add_index, mask);
475 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, add_dst, inst->U.I.SrcReg[0],
476 builtin_half);
477 add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
478
479
480 /* frac = FRC(add) */
481 frac_index = rc_find_free_temporary(c);
482 frac_dst = dstregtmpmask(frac_index, mask);
483 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, frac_dst, add_src);
484 frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
485
486 /* dst = add - frac */
487 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, inst->U.I.DstReg,
488 add_src, negate(frac_src));
489 rc_remove_instruction(inst);
490 }
491
transform_RSQ(struct radeon_compiler * c,struct rc_instruction * inst)492 static void transform_RSQ(struct radeon_compiler* c,
493 struct rc_instruction* inst)
494 {
495 inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
496 }
497
transform_SEQ(struct radeon_compiler * c,struct rc_instruction * inst)498 static void transform_SEQ(struct radeon_compiler* c,
499 struct rc_instruction* inst)
500 {
501 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
502
503 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
504 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
505 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
506
507 rc_remove_instruction(inst);
508 }
509
transform_SFL(struct radeon_compiler * c,struct rc_instruction * inst)510 static void transform_SFL(struct radeon_compiler* c,
511 struct rc_instruction* inst)
512 {
513 emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero);
514 rc_remove_instruction(inst);
515 }
516
transform_SGE(struct radeon_compiler * c,struct rc_instruction * inst)517 static void transform_SGE(struct radeon_compiler* c,
518 struct rc_instruction* inst)
519 {
520 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
521
522 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
523 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
524 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
525
526 rc_remove_instruction(inst);
527 }
528
transform_SGT(struct radeon_compiler * c,struct rc_instruction * inst)529 static void transform_SGT(struct radeon_compiler* c,
530 struct rc_instruction* inst)
531 {
532 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
533
534 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
535 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
536 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
537
538 rc_remove_instruction(inst);
539 }
540
transform_SLE(struct radeon_compiler * c,struct rc_instruction * inst)541 static void transform_SLE(struct radeon_compiler* c,
542 struct rc_instruction* inst)
543 {
544 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
545
546 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
547 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
548 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
549
550 rc_remove_instruction(inst);
551 }
552
transform_SLT(struct radeon_compiler * c,struct rc_instruction * inst)553 static void transform_SLT(struct radeon_compiler* c,
554 struct rc_instruction* inst)
555 {
556 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
557
558 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
559 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
560 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
561
562 rc_remove_instruction(inst);
563 }
564
transform_SNE(struct radeon_compiler * c,struct rc_instruction * inst)565 static void transform_SNE(struct radeon_compiler* c,
566 struct rc_instruction* inst)
567 {
568 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
569
570 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
571 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
572 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
573
574 rc_remove_instruction(inst);
575 }
576
transform_SSG(struct radeon_compiler * c,struct rc_instruction * inst)577 static void transform_SSG(struct radeon_compiler* c,
578 struct rc_instruction* inst)
579 {
580 /* result = sign(x)
581 *
582 * CMP tmp0, -x, 1, 0
583 * CMP tmp1, x, 1, 0
584 * ADD result, tmp0, -tmp1;
585 */
586 struct rc_dst_register dst0;
587 unsigned tmp1;
588
589 /* 0 < x */
590 dst0 = try_to_reuse_dst(c, inst);
591 emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
592 dst0,
593 negate(inst->U.I.SrcReg[0]),
594 builtin_one,
595 builtin_zero);
596
597 /* x < 0 */
598 tmp1 = rc_find_free_temporary(c);
599 emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
600 dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
601 inst->U.I.SrcReg[0],
602 builtin_one,
603 builtin_zero);
604
605 /* Either both are zero, or one of them is one and the other is zero. */
606 /* result = tmp0 - tmp1 */
607 emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
608 inst->U.I.DstReg,
609 srcreg(RC_FILE_TEMPORARY, dst0.Index),
610 negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
611
612 rc_remove_instruction(inst);
613 }
614
transform_SUB(struct radeon_compiler * c,struct rc_instruction * inst)615 static void transform_SUB(struct radeon_compiler* c,
616 struct rc_instruction* inst)
617 {
618 inst->U.I.Opcode = RC_OPCODE_ADD;
619 inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);
620 }
621
transform_SWZ(struct radeon_compiler * c,struct rc_instruction * inst)622 static void transform_SWZ(struct radeon_compiler* c,
623 struct rc_instruction* inst)
624 {
625 inst->U.I.Opcode = RC_OPCODE_MOV;
626 }
627
transform_XPD(struct radeon_compiler * c,struct rc_instruction * inst)628 static void transform_XPD(struct radeon_compiler* c,
629 struct rc_instruction* inst)
630 {
631 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
632
633 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
634 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
635 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
636 emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg,
637 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
638 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
639 negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
640
641 rc_remove_instruction(inst);
642 }
643
644
645 /**
646 * Can be used as a transformation for @ref radeonClauseLocalTransform,
647 * no userData necessary.
648 *
649 * Eliminates the following ALU instructions:
650 * ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
651 * using:
652 * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
653 *
654 * Transforms RSQ to Radeon's native RSQ by explicitly setting
655 * absolute value.
656 *
657 * @note should be applicable to R300 and R500 fragment programs.
658 */
radeonTransformALU(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)659 int radeonTransformALU(
660 struct radeon_compiler * c,
661 struct rc_instruction* inst,
662 void* unused)
663 {
664 switch(inst->U.I.Opcode) {
665 case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
666 case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
667 case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
668 case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
669 case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
670 case RC_OPCODE_DST: transform_DST(c, inst); return 1;
671 case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
672 case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
673 case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
674 case RC_OPCODE_POW: transform_POW(c, inst); return 1;
675 case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;
676 case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
677 case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
678 case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
679 case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
680 case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;
681 case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
682 case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
683 case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
684 case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
685 case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
686 case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
687 case RC_OPCODE_TRUNC: transform_TRUNC(c, inst); return 1;
688 case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
689 default:
690 return 0;
691 }
692 }
693
694
transform_r300_vertex_ABS(struct radeon_compiler * c,struct rc_instruction * inst)695 static void transform_r300_vertex_ABS(struct radeon_compiler* c,
696 struct rc_instruction* inst)
697 {
698 /* Note: r500 can take absolute values, but r300 cannot. */
699 inst->U.I.Opcode = RC_OPCODE_MAX;
700 inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];
701 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
702 }
703
transform_r300_vertex_CMP(struct radeon_compiler * c,struct rc_instruction * inst)704 static void transform_r300_vertex_CMP(struct radeon_compiler* c,
705 struct rc_instruction* inst)
706 {
707 /* There is no decent CMP available, so let's rig one up.
708 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
709 * The following sequence consumes zero to two temps and two extra slots
710 * (the second temp and the second slot is consumed by transform_LRP),
711 * but should be equivalent:
712 *
713 * SLT tmp0, src0, 0.0
714 * LRP dst, tmp0, src1, src2
715 *
716 * Yes, I know, I'm a mad scientist. ~ C. & M. */
717 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
718
719 /* SLT tmp0, src0, 0.0 */
720 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
721 dst,
722 inst->U.I.SrcReg[0], builtin_zero);
723
724 /* LRP dst, tmp0, src1, src2 */
725 transform_LRP(c,
726 emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
727 inst->U.I.DstReg,
728 srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]));
729
730 rc_remove_instruction(inst);
731 }
732
transform_r300_vertex_DP2(struct radeon_compiler * c,struct rc_instruction * inst)733 static void transform_r300_vertex_DP2(struct radeon_compiler* c,
734 struct rc_instruction* inst)
735 {
736 struct rc_instruction *next_inst = inst->Next;
737 transform_DP2(c, inst);
738 next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
739 }
740
transform_r300_vertex_DP3(struct radeon_compiler * c,struct rc_instruction * inst)741 static void transform_r300_vertex_DP3(struct radeon_compiler* c,
742 struct rc_instruction* inst)
743 {
744 struct rc_src_register src0 = inst->U.I.SrcReg[0];
745 struct rc_src_register src1 = inst->U.I.SrcReg[1];
746 src0.Negate &= ~RC_MASK_W;
747 src0.Swizzle &= ~(7 << (3 * 3));
748 src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
749 src1.Negate &= ~RC_MASK_W;
750 src1.Swizzle &= ~(7 << (3 * 3));
751 src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
752 emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);
753 rc_remove_instruction(inst);
754 }
755
transform_r300_vertex_fix_LIT(struct radeon_compiler * c,struct rc_instruction * inst)756 static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
757 struct rc_instruction* inst)
758 {
759 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
760 unsigned constant_swizzle;
761 int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
762 0.0000000000000000001,
763 &constant_swizzle);
764
765 /* MOV dst, src */
766 dst.WriteMask = RC_MASK_XYZW;
767 emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
768 dst,
769 inst->U.I.SrcReg[0]);
770
771 /* MAX dst.y, src, 0.00...001 */
772 emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
773 dstregtmpmask(dst.Index, RC_MASK_Y),
774 srcreg(RC_FILE_TEMPORARY, dst.Index),
775 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
776
777 inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
778 }
779
transform_r300_vertex_SEQ(struct radeon_compiler * c,struct rc_instruction * inst)780 static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
781 struct rc_instruction *inst)
782 {
783 /* x = y <==> x >= y && y >= x */
784 int tmp = rc_find_free_temporary(c);
785
786 /* x <= y */
787 emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
788 dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
789 inst->U.I.SrcReg[0],
790 inst->U.I.SrcReg[1]);
791
792 /* y <= x */
793 emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
794 inst->U.I.DstReg,
795 inst->U.I.SrcReg[1],
796 inst->U.I.SrcReg[0]);
797
798 /* x && y = x * y */
799 emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
800 inst->U.I.DstReg,
801 srcreg(RC_FILE_TEMPORARY, tmp),
802 srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
803
804 rc_remove_instruction(inst);
805 }
806
transform_r300_vertex_SNE(struct radeon_compiler * c,struct rc_instruction * inst)807 static void transform_r300_vertex_SNE(struct radeon_compiler *c,
808 struct rc_instruction *inst)
809 {
810 /* x != y <==> x < y || y < x */
811 int tmp = rc_find_free_temporary(c);
812
813 /* x < y */
814 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
815 dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
816 inst->U.I.SrcReg[0],
817 inst->U.I.SrcReg[1]);
818
819 /* y < x */
820 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
821 inst->U.I.DstReg,
822 inst->U.I.SrcReg[1],
823 inst->U.I.SrcReg[0]);
824
825 /* x || y = max(x, y) */
826 emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
827 inst->U.I.DstReg,
828 srcreg(RC_FILE_TEMPORARY, tmp),
829 srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
830
831 rc_remove_instruction(inst);
832 }
833
transform_r300_vertex_SGT(struct radeon_compiler * c,struct rc_instruction * inst)834 static void transform_r300_vertex_SGT(struct radeon_compiler* c,
835 struct rc_instruction* inst)
836 {
837 /* x > y <==> -x < -y */
838 inst->U.I.Opcode = RC_OPCODE_SLT;
839 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
840 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
841 }
842
transform_r300_vertex_SLE(struct radeon_compiler * c,struct rc_instruction * inst)843 static void transform_r300_vertex_SLE(struct radeon_compiler* c,
844 struct rc_instruction* inst)
845 {
846 /* x <= y <==> -x >= -y */
847 inst->U.I.Opcode = RC_OPCODE_SGE;
848 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
849 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
850 }
851
transform_r300_vertex_SSG(struct radeon_compiler * c,struct rc_instruction * inst)852 static void transform_r300_vertex_SSG(struct radeon_compiler* c,
853 struct rc_instruction* inst)
854 {
855 /* result = sign(x)
856 *
857 * SLT tmp0, 0, x;
858 * SLT tmp1, x, 0;
859 * ADD result, tmp0, -tmp1;
860 */
861 struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
862 unsigned tmp1;
863
864 /* 0 < x */
865 dst0 = try_to_reuse_dst(c, inst);
866 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
867 dst0,
868 builtin_zero,
869 inst->U.I.SrcReg[0]);
870
871 /* x < 0 */
872 tmp1 = rc_find_free_temporary(c);
873 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
874 dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
875 inst->U.I.SrcReg[0],
876 builtin_zero);
877
878 /* Either both are zero, or one of them is one and the other is zero. */
879 /* result = tmp0 - tmp1 */
880 emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
881 inst->U.I.DstReg,
882 srcreg(RC_FILE_TEMPORARY, dst0.Index),
883 negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
884
885 rc_remove_instruction(inst);
886 }
887
transform_vertex_TRUNC(struct radeon_compiler * c,struct rc_instruction * inst)888 static void transform_vertex_TRUNC(struct radeon_compiler* c,
889 struct rc_instruction* inst)
890 {
891 struct rc_instruction *next = inst->Next;
892
893 /* next->Prev is removed after each transformation and replaced
894 * by a new instruction. */
895 transform_TRUNC(c, next->Prev);
896 transform_r300_vertex_CMP(c, next->Prev);
897 }
898
899 /**
900 * For use with rc_local_transform, this transforms non-native ALU
901 * instructions of the r300 up to r500 vertex engine.
902 */
r300_transform_vertex_alu(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)903 int r300_transform_vertex_alu(
904 struct radeon_compiler * c,
905 struct rc_instruction* inst,
906 void* unused)
907 {
908 switch(inst->U.I.Opcode) {
909 case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
910 case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
911 case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
912 case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
913 case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
914 case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
915 case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
916 case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
917 case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
918 case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
919 case RC_OPCODE_SEQ:
920 if (!c->is_r500) {
921 transform_r300_vertex_SEQ(c, inst);
922 return 1;
923 }
924 return 0;
925 case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
926 case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1;
927 case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1;
928 case RC_OPCODE_SNE:
929 if (!c->is_r500) {
930 transform_r300_vertex_SNE(c, inst);
931 return 1;
932 }
933 return 0;
934 case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
935 case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
936 case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
937 case RC_OPCODE_TRUNC: transform_vertex_TRUNC(c, inst); return 1;
938 case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
939 default:
940 return 0;
941 }
942 }
943
sincos_constants(struct radeon_compiler * c,unsigned int * constants)944 static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
945 {
946 static const float SinCosConsts[2][4] = {
947 {
948 1.273239545, /* 4/PI */
949 -0.405284735, /* -4/(PI*PI) */
950 3.141592654, /* PI */
951 0.2225 /* weight */
952 },
953 {
954 0.75,
955 0.5,
956 0.159154943, /* 1/(2*PI) */
957 6.283185307 /* 2*PI */
958 }
959 };
960 int i;
961
962 for(i = 0; i < 2; ++i)
963 constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
964 }
965
966 /**
967 * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
968 *
969 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
970 * MAD tmp.x, tmp.y, |src|, tmp.x
971 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
972 * MAD dest, tmp.y, weight, tmp.x
973 */
sin_approx(struct radeon_compiler * c,struct rc_instruction * inst,struct rc_dst_register dst,struct rc_src_register src,const unsigned int * constants)974 static void sin_approx(
975 struct radeon_compiler* c, struct rc_instruction * inst,
976 struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
977 {
978 unsigned int tempreg = rc_find_free_temporary(c);
979
980 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
981 swizzle_xxxx(src),
982 srcreg(RC_FILE_CONSTANT, constants[0]));
983 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
984 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
985 absolute(swizzle_xxxx(src)),
986 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
987 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
988 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
989 absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
990 negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
991 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
992 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
993 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
994 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
995 }
996
997 /**
998 * Translate the trigonometric functions COS, SIN, and SCS
999 * using only the basic instructions
1000 * MOV, ADD, MUL, MAD, FRC
1001 */
r300_transform_trig_simple(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)1002 int r300_transform_trig_simple(struct radeon_compiler* c,
1003 struct rc_instruction* inst,
1004 void* unused)
1005 {
1006 unsigned int constants[2];
1007 unsigned int tempreg;
1008
1009 if (inst->U.I.Opcode != RC_OPCODE_COS &&
1010 inst->U.I.Opcode != RC_OPCODE_SIN &&
1011 inst->U.I.Opcode != RC_OPCODE_SCS)
1012 return 0;
1013
1014 tempreg = rc_find_free_temporary(c);
1015
1016 sincos_constants(c, constants);
1017
1018 if (inst->U.I.Opcode == RC_OPCODE_COS) {
1019 /* MAD tmp.x, src, 1/(2*PI), 0.75 */
1020 /* FRC tmp.x, tmp.x */
1021 /* MAD tmp.z, tmp.x, 2*PI, -PI */
1022 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1023 swizzle_xxxx(inst->U.I.SrcReg[0]),
1024 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1025 swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
1026 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
1027 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
1028 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1029 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1030 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1031 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1032
1033 sin_approx(c, inst, inst->U.I.DstReg,
1034 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1035 constants);
1036 } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1037 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1038 swizzle_xxxx(inst->U.I.SrcReg[0]),
1039 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1040 swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
1041 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
1042 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
1043 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1044 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1045 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1046 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1047
1048 sin_approx(c, inst, inst->U.I.DstReg,
1049 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1050 constants);
1051 } else {
1052 struct rc_dst_register dst;
1053
1054 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1055 swizzle_xxxx(inst->U.I.SrcReg[0]),
1056 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1057 swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
1058 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1059 srcreg(RC_FILE_TEMPORARY, tempreg));
1060 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1061 srcreg(RC_FILE_TEMPORARY, tempreg),
1062 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1063 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1064
1065 dst = inst->U.I.DstReg;
1066
1067 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
1068 sin_approx(c, inst, dst,
1069 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
1070 constants);
1071
1072 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
1073 sin_approx(c, inst, dst,
1074 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
1075 constants);
1076 }
1077
1078 rc_remove_instruction(inst);
1079
1080 return 1;
1081 }
1082
r300_transform_SIN_COS_SCS(struct radeon_compiler * c,struct rc_instruction * inst,unsigned srctmp)1083 static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
1084 struct rc_instruction *inst,
1085 unsigned srctmp)
1086 {
1087 if (inst->U.I.Opcode == RC_OPCODE_COS) {
1088 emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
1089 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1090 } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1091 emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
1092 inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1093 } else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
1094 struct rc_dst_register moddst = inst->U.I.DstReg;
1095
1096 if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
1097 moddst.WriteMask = RC_MASK_X;
1098 emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst,
1099 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1100 }
1101 if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
1102 moddst.WriteMask = RC_MASK_Y;
1103 emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst,
1104 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1105 }
1106 }
1107
1108 rc_remove_instruction(inst);
1109 }
1110
1111
1112 /**
1113 * Transform the trigonometric functions COS, SIN, and SCS
1114 * to include pre-scaling by 1/(2*PI) and taking the fractional
1115 * part, so that the input to COS and SIN is always in the range [0,1).
1116 * SCS is replaced by one COS and one SIN instruction.
1117 *
1118 * @warning This transformation implicitly changes the semantics of SIN and COS!
1119 */
radeonTransformTrigScale(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)1120 int radeonTransformTrigScale(struct radeon_compiler* c,
1121 struct rc_instruction* inst,
1122 void* unused)
1123 {
1124 static const float RCP_2PI = 0.15915494309189535;
1125 unsigned int temp;
1126 unsigned int constant;
1127 unsigned int constant_swizzle;
1128
1129 if (inst->U.I.Opcode != RC_OPCODE_COS &&
1130 inst->U.I.Opcode != RC_OPCODE_SIN &&
1131 inst->U.I.Opcode != RC_OPCODE_SCS)
1132 return 0;
1133
1134 temp = rc_find_free_temporary(c);
1135 constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
1136
1137 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),
1138 swizzle_xxxx(inst->U.I.SrcReg[0]),
1139 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
1140 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1141 srcreg(RC_FILE_TEMPORARY, temp));
1142
1143 r300_transform_SIN_COS_SCS(c, inst, temp);
1144 return 1;
1145 }
1146
1147 /**
1148 * Transform the trigonometric functions COS, SIN, and SCS
1149 * so that the input to COS and SIN is always in the range [-PI, PI].
1150 * SCS is replaced by one COS and one SIN instruction.
1151 */
r300_transform_trig_scale_vertex(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)1152 int r300_transform_trig_scale_vertex(struct radeon_compiler *c,
1153 struct rc_instruction *inst,
1154 void *unused)
1155 {
1156 static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
1157 unsigned int temp;
1158 unsigned int constant;
1159
1160 if (inst->U.I.Opcode != RC_OPCODE_COS &&
1161 inst->U.I.Opcode != RC_OPCODE_SIN &&
1162 inst->U.I.Opcode != RC_OPCODE_SCS)
1163 return 0;
1164
1165 /* Repeat x in the range [-PI, PI]:
1166 *
1167 * repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
1168 */
1169
1170 temp = rc_find_free_temporary(c);
1171 constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons);
1172
1173 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1174 swizzle_xxxx(inst->U.I.SrcReg[0]),
1175 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX),
1176 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY));
1177 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1178 srcreg(RC_FILE_TEMPORARY, temp));
1179 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1180 srcreg(RC_FILE_TEMPORARY, temp),
1181 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ),
1182 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW));
1183
1184 r300_transform_SIN_COS_SCS(c, inst, temp);
1185 return 1;
1186 }
1187
1188 /**
1189 * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
1190 * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
1191 * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
1192 *
1193 * @warning This explicitly changes the form of DDX and DDY!
1194 */
1195
radeonTransformDeriv(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)1196 int radeonTransformDeriv(struct radeon_compiler* c,
1197 struct rc_instruction* inst,
1198 void* unused)
1199 {
1200 if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
1201 return 0;
1202
1203 inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;
1204 inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
1205
1206 return 1;
1207 }
1208
1209 /**
1210 * IF Temp[0].x -> IF Temp[0].x
1211 * ... -> ...
1212 * KILP -> KIL -abs(Temp[0].x)
1213 * ... -> ...
1214 * ENDIF -> ENDIF
1215 *
1216 * === OR ===
1217 *
1218 * IF Temp[0].x -\
1219 * KILP - > KIL -abs(Temp[0].x)
1220 * ENDIF -/
1221 *
1222 * === OR ===
1223 *
1224 * IF Temp[0].x -> IF Temp[0].x
1225 * ... -> ...
1226 * ELSE -> ELSE
1227 * ... -> ...
1228 * KILP -> KIL -abs(Temp[0].x)
1229 * ... -> ...
1230 * ENDIF -> ENDIF
1231 *
1232 * === OR ===
1233 *
1234 * KILP -> KIL -none.1111
1235 *
1236 * This needs to be done in its own pass, because it might modify the
1237 * instructions before and after KILP.
1238 */
rc_transform_KILP(struct radeon_compiler * c,void * user)1239 void rc_transform_KILP(struct radeon_compiler * c, void *user)
1240 {
1241 struct rc_instruction * inst;
1242 for (inst = c->Program.Instructions.Next;
1243 inst != &c->Program.Instructions; inst = inst->Next) {
1244 struct rc_instruction * if_inst;
1245 unsigned in_if = 0;
1246
1247 if (inst->U.I.Opcode != RC_OPCODE_KILP)
1248 continue;
1249
1250 for (if_inst = inst->Prev; if_inst != &c->Program.Instructions;
1251 if_inst = if_inst->Prev) {
1252
1253 if (if_inst->U.I.Opcode == RC_OPCODE_IF) {
1254 in_if = 1;
1255 break;
1256 }
1257 }
1258
1259 inst->U.I.Opcode = RC_OPCODE_KIL;
1260
1261 if (!in_if) {
1262 inst->U.I.SrcReg[0] = negate(builtin_one);
1263 } else {
1264 /* This should work even if the KILP is inside the ELSE
1265 * block, because -0.0 is considered negative. */
1266 inst->U.I.SrcReg[0] =
1267 negate(absolute(if_inst->U.I.SrcReg[0]));
1268
1269 if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
1270 && inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
1271
1272 /* Optimize the special case:
1273 * IF Temp[0].x
1274 * KILP
1275 * ENDIF
1276 */
1277
1278 /* Remove IF */
1279 rc_remove_instruction(inst->Prev);
1280 /* Remove ENDIF */
1281 rc_remove_instruction(inst->Next);
1282 }
1283 }
1284 }
1285 }
1286