1 /*
2  * Copyright (C) 2008 Nicolai Haehnle.
3  *
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining
7  * a copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sublicense, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial
16  * portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  */
27 
28 /**
29  * @file
30  *
31  * Shareable transformations that transform "special" ALU instructions
32  * into ALU instructions that are supported by hardware.
33  *
34  */
35 
36 #include "radeon_program_alu.h"
37 
38 #include "radeon_compiler.h"
39 #include "radeon_compiler_util.h"
40 
41 
/**
 * Insert a new single-source instruction right after @p after.
 *
 * When @p base is non-null the complete sub-instruction is copied from it
 * first, so every field that is not explicitly set below is inherited.
 * Opcode, destination and source 0 are then overwritten.
 *
 * @return the freshly inserted instruction.
 */
static struct rc_instruction *emit1(
	struct radeon_compiler * c, struct rc_instruction * after,
	rc_opcode Opcode, struct rc_sub_instruction * base,
	struct rc_dst_register DstReg, struct rc_src_register SrcReg)
{
	struct rc_instruction *new_inst = rc_insert_new_instruction(c, after);
	struct rc_sub_instruction *sub = &new_inst->U.I;

	if (base)
		*sub = *base;

	sub->Opcode = Opcode;
	sub->DstReg = DstReg;
	sub->SrcReg[0] = SrcReg;

	return new_inst;
}
58 
/**
 * Insert a new two-source instruction right after @p after.
 *
 * When @p base is non-null the complete sub-instruction is copied from it
 * first, so every field that is not explicitly set below is inherited.
 * Opcode, destination and sources 0 and 1 are then overwritten.
 *
 * @return the freshly inserted instruction.
 */
static struct rc_instruction *emit2(
	struct radeon_compiler * c, struct rc_instruction * after,
	rc_opcode Opcode, struct rc_sub_instruction * base,
	struct rc_dst_register DstReg,
	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
{
	struct rc_instruction *new_inst = rc_insert_new_instruction(c, after);
	struct rc_sub_instruction *sub = &new_inst->U.I;

	if (base)
		*sub = *base;

	sub->Opcode = Opcode;
	sub->DstReg = DstReg;
	sub->SrcReg[0] = SrcReg0;
	sub->SrcReg[1] = SrcReg1;

	return new_inst;
}
77 
/**
 * Insert a new three-source instruction right after @p after.
 *
 * When @p base is non-null the complete sub-instruction is copied from it
 * first, so every field that is not explicitly set below is inherited.
 * Opcode, destination and sources 0, 1 and 2 are then overwritten.
 *
 * @return the freshly inserted instruction.
 */
static struct rc_instruction *emit3(
	struct radeon_compiler * c, struct rc_instruction * after,
	rc_opcode Opcode, struct rc_sub_instruction * base,
	struct rc_dst_register DstReg,
	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
	struct rc_src_register SrcReg2)
{
	struct rc_instruction *new_inst = rc_insert_new_instruction(c, after);
	struct rc_sub_instruction *sub = &new_inst->U.I;

	if (base)
		*sub = *base;

	sub->Opcode = Opcode;
	sub->DstReg = DstReg;
	sub->SrcReg[0] = SrcReg0;
	sub->SrcReg[1] = SrcReg1;
	sub->SrcReg[2] = SrcReg2;

	return new_inst;
}
98 
dstregtmpmask(int index,int mask)99 static struct rc_dst_register dstregtmpmask(int index, int mask)
100 {
101 	struct rc_dst_register dst = {0, 0, 0};
102 	dst.File = RC_FILE_TEMPORARY;
103 	dst.Index = index;
104 	dst.WriteMask = mask;
105 	return dst;
106 }
107 
108 static const struct rc_src_register builtin_zero = {
109 	.File = RC_FILE_NONE,
110 	.Index = 0,
111 	.Swizzle = RC_SWIZZLE_0000
112 };
113 static const struct rc_src_register builtin_one = {
114 	.File = RC_FILE_NONE,
115 	.Index = 0,
116 	.Swizzle = RC_SWIZZLE_1111
117 };
118 
119 static const struct rc_src_register builtin_half = {
120 	.File = RC_FILE_NONE,
121 	.Index = 0,
122 	.Swizzle = RC_SWIZZLE_HHHH
123 };
124 
125 static const struct rc_src_register srcreg_undefined = {
126 	.File = RC_FILE_NONE,
127 	.Index = 0,
128 	.Swizzle = RC_SWIZZLE_XYZW
129 };
130 
srcreg(int file,int index)131 static struct rc_src_register srcreg(int file, int index)
132 {
133 	struct rc_src_register src = srcreg_undefined;
134 	src.File = file;
135 	src.Index = index;
136 	return src;
137 }
138 
srcregswz(int file,int index,int swz)139 static struct rc_src_register srcregswz(int file, int index, int swz)
140 {
141 	struct rc_src_register src = srcreg_undefined;
142 	src.File = file;
143 	src.Index = index;
144 	src.Swizzle = swz;
145 	return src;
146 }
147 
absolute(struct rc_src_register reg)148 static struct rc_src_register absolute(struct rc_src_register reg)
149 {
150 	struct rc_src_register newreg = reg;
151 	newreg.Abs = 1;
152 	newreg.Negate = RC_MASK_NONE;
153 	return newreg;
154 }
155 
negate(struct rc_src_register reg)156 static struct rc_src_register negate(struct rc_src_register reg)
157 {
158 	struct rc_src_register newreg = reg;
159 	newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
160 	return newreg;
161 }
162 
/**
 * Return a copy of @p reg whose swizzle is the existing swizzle combined
 * (via combine_swizzles4) with the per-component selectors @p x..@p w.
 */
static struct rc_src_register swizzle(struct rc_src_register reg,
		rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
{
	/* reg is already a private by-value copy, so edit it in place. */
	reg.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);

	return reg;
}
170 
/**
 * Return a copy of @p reg in which all four components use selector @p x.
 */
static struct rc_src_register swizzle_smear(struct rc_src_register reg,
		rc_swizzle x)
{
	struct rc_src_register smeared = swizzle(reg, x, x, x, x);

	return smeared;
}
176 
swizzle_xxxx(struct rc_src_register reg)177 static struct rc_src_register swizzle_xxxx(struct rc_src_register reg)
178 {
179 	return swizzle_smear(reg, RC_SWIZZLE_X);
180 }
181 
swizzle_yyyy(struct rc_src_register reg)182 static struct rc_src_register swizzle_yyyy(struct rc_src_register reg)
183 {
184 	return swizzle_smear(reg, RC_SWIZZLE_Y);
185 }
186 
swizzle_zzzz(struct rc_src_register reg)187 static struct rc_src_register swizzle_zzzz(struct rc_src_register reg)
188 {
189 	return swizzle_smear(reg, RC_SWIZZLE_Z);
190 }
191 
swizzle_wwww(struct rc_src_register reg)192 static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
193 {
194 	return swizzle_smear(reg, RC_SWIZZLE_W);
195 }
196 
is_dst_safe_to_reuse(struct rc_instruction * inst)197 static int is_dst_safe_to_reuse(struct rc_instruction *inst)
198 {
199 	const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
200 	unsigned i;
201 
202 	assert(info->HasDstReg);
203 
204 	if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
205 		return 0;
206 
207 	for (i = 0; i < info->NumSrcRegs; i++) {
208 		if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
209 		    inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
210 			return 0;
211 	}
212 
213 	return 1;
214 }
215 
try_to_reuse_dst(struct radeon_compiler * c,struct rc_instruction * inst)216 static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
217 					       struct rc_instruction *inst)
218 {
219 	unsigned tmp;
220 
221 	if (is_dst_safe_to_reuse(inst))
222 		tmp = inst->U.I.DstReg.Index;
223 	else
224 		tmp = rc_find_free_temporary(c);
225 
226 	return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
227 }
228 
transform_ABS(struct radeon_compiler * c,struct rc_instruction * inst)229 static void transform_ABS(struct radeon_compiler* c,
230 	struct rc_instruction* inst)
231 {
232 	struct rc_src_register src = inst->U.I.SrcReg[0];
233 	src.Abs = 1;
234 	src.Negate = RC_MASK_NONE;
235 	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src);
236 	rc_remove_instruction(inst);
237 }
238 
transform_CEIL(struct radeon_compiler * c,struct rc_instruction * inst)239 static void transform_CEIL(struct radeon_compiler* c,
240 	struct rc_instruction* inst)
241 {
242 	/* Assuming:
243 	 *     ceil(x) = -floor(-x)
244 	 *
245 	 * After inlining floor:
246 	 *     ceil(x) = -(-x-frac(-x))
247 	 *
248 	 * After simplification:
249 	 *     ceil(x) = x+frac(-x)
250 	 */
251 
252 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
253 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
254 	emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
255 		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
256 	rc_remove_instruction(inst);
257 }
258 
transform_CLAMP(struct radeon_compiler * c,struct rc_instruction * inst)259 static void transform_CLAMP(struct radeon_compiler *c,
260 	struct rc_instruction *inst)
261 {
262 	/* CLAMP dst, src, min, max
263 	 *    into:
264 	 * MIN tmp, src, max
265 	 * MAX dst, tmp, min
266 	 */
267 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
268 	emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
269 		inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
270 	emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg,
271 		srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
272 	rc_remove_instruction(inst);
273 }
274 
transform_DP2(struct radeon_compiler * c,struct rc_instruction * inst)275 static void transform_DP2(struct radeon_compiler* c,
276 	struct rc_instruction* inst)
277 {
278 	struct rc_src_register src0 = inst->U.I.SrcReg[0];
279 	struct rc_src_register src1 = inst->U.I.SrcReg[1];
280 	src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
281 	src0.Swizzle &= ~(63 << (3 * 2));
282 	src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
283 	src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
284 	src1.Swizzle &= ~(63 << (3 * 2));
285 	src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
286 	emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);
287 	rc_remove_instruction(inst);
288 }
289 
transform_DPH(struct radeon_compiler * c,struct rc_instruction * inst)290 static void transform_DPH(struct radeon_compiler* c,
291 	struct rc_instruction* inst)
292 {
293 	struct rc_src_register src0 = inst->U.I.SrcReg[0];
294 	src0.Negate &= ~RC_MASK_W;
295 	src0.Swizzle &= ~(7 << (3 * 3));
296 	src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
297 	emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
298 	rc_remove_instruction(inst);
299 }
300 
301 /**
302  * [1, src0.y*src1.y, src0.z, src1.w]
303  * So basically MUL with lotsa swizzling.
304  */
transform_DST(struct radeon_compiler * c,struct rc_instruction * inst)305 static void transform_DST(struct radeon_compiler* c,
306 	struct rc_instruction* inst)
307 {
308 	emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
309 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
310 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
311 	rc_remove_instruction(inst);
312 }
313 
transform_FLR(struct radeon_compiler * c,struct rc_instruction * inst)314 static void transform_FLR(struct radeon_compiler* c,
315 	struct rc_instruction* inst)
316 {
317 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
318 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
319 	emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
320 		inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
321 	rc_remove_instruction(inst);
322 }
323 
transform_TRUNC(struct radeon_compiler * c,struct rc_instruction * inst)324 static void transform_TRUNC(struct radeon_compiler* c,
325 	struct rc_instruction* inst)
326 {
327 	/* Definition of trunc:
328 	 *   trunc(x) = (abs(x) - fract(abs(x))) * sgn(x)
329 	 *
330 	 * The multiplication by sgn(x) can be simplified using CMP:
331 	 *   y * sgn(x) = (x < 0 ? -y : y)
332 	 */
333 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
334 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, absolute(inst->U.I.SrcReg[0]));
335 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, absolute(inst->U.I.SrcReg[0]),
336 	      negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
337 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, inst->U.I.SrcReg[0],
338 	      negate(srcreg(RC_FILE_TEMPORARY, dst.Index)), srcreg(RC_FILE_TEMPORARY, dst.Index));
339 	rc_remove_instruction(inst);
340 }
341 
342 /**
343  * Definition of LIT (from ARB_fragment_program):
344  *
345  *  tmp = VectorLoad(op0);
346  *  if (tmp.x < 0) tmp.x = 0;
347  *  if (tmp.y < 0) tmp.y = 0;
348  *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
349  *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
350  *  result.x = 1.0;
351  *  result.y = tmp.x;
352  *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
353  *  result.w = 1.0;
354  *
355  * The longest path of computation is the one leading to result.z,
356  * consisting of 5 operations. This implementation of LIT takes
357  * 5 slots, if the subsequent optimization passes are clever enough
358  * to pair instructions correctly.
359  */
transform_LIT(struct radeon_compiler * c,struct rc_instruction * inst)360 static void transform_LIT(struct radeon_compiler* c,
361 	struct rc_instruction* inst)
362 {
363 	unsigned int constant;
364 	unsigned int constant_swizzle;
365 	unsigned int temp;
366 	struct rc_src_register srctemp;
367 
368 	constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
369 
370 	if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
371 		struct rc_instruction * inst_mov;
372 
373 		inst_mov = emit1(c, inst,
374 			RC_OPCODE_MOV, 0, inst->U.I.DstReg,
375 			srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
376 
377 		inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
378 		inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
379 		inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
380 	}
381 
382 	temp = inst->U.I.DstReg.Index;
383 	srctemp = srcreg(RC_FILE_TEMPORARY, temp);
384 
385 	/* tmp.x = max(0.0, Src.x); */
386 	/* tmp.y = max(0.0, Src.y); */
387 	/* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
388 	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
389 		dstregtmpmask(temp, RC_MASK_XYW),
390 		inst->U.I.SrcReg[0],
391 		swizzle(srcreg(RC_FILE_CONSTANT, constant),
392 			RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
393 	emit2(c, inst->Prev, RC_OPCODE_MIN, 0,
394 		dstregtmpmask(temp, RC_MASK_Z),
395 		swizzle_wwww(srctemp),
396 		negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
397 
398 	/* tmp.w = Pow(tmp.y, tmp.w) */
399 	emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
400 		dstregtmpmask(temp, RC_MASK_W),
401 		swizzle_yyyy(srctemp));
402 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
403 		dstregtmpmask(temp, RC_MASK_W),
404 		swizzle_wwww(srctemp),
405 		swizzle_zzzz(srctemp));
406 	emit1(c, inst->Prev, RC_OPCODE_EX2, 0,
407 		dstregtmpmask(temp, RC_MASK_W),
408 		swizzle_wwww(srctemp));
409 
410 	/* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
411 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
412 		dstregtmpmask(temp, RC_MASK_Z),
413 		negate(swizzle_xxxx(srctemp)),
414 		swizzle_wwww(srctemp),
415 		builtin_zero);
416 
417 	/* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
418 	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
419 		dstregtmpmask(temp, RC_MASK_XYW),
420 		swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
421 
422 	rc_remove_instruction(inst);
423 }
424 
transform_LRP(struct radeon_compiler * c,struct rc_instruction * inst)425 static void transform_LRP(struct radeon_compiler* c,
426 	struct rc_instruction* inst)
427 {
428 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
429 
430 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
431 		dst,
432 		inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
433 	emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I,
434 		inst->U.I.DstReg,
435 		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
436 
437 	rc_remove_instruction(inst);
438 }
439 
transform_POW(struct radeon_compiler * c,struct rc_instruction * inst)440 static void transform_POW(struct radeon_compiler* c,
441 	struct rc_instruction* inst)
442 {
443 	struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
444 	struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
445 	tempdst.WriteMask = RC_MASK_W;
446 	tempsrc.Swizzle = RC_SWIZZLE_WWWW;
447 
448 	emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
449 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
450 	emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
451 
452 	rc_remove_instruction(inst);
453 }
454 
455 /* dst = ROUND(src) :
456  *   add = src + .5
457  *   frac = FRC(add)
458  *   dst = add - frac
459  *
460  * According to the GLSL spec, the implementor can decide which way to round
461  * when the fraction is .5.  We round down for .5.
462  *
463  */
transform_ROUND(struct radeon_compiler * c,struct rc_instruction * inst)464 static void transform_ROUND(struct radeon_compiler* c,
465 	struct rc_instruction* inst)
466 {
467 	unsigned int mask = inst->U.I.DstReg.WriteMask;
468 	unsigned int frac_index, add_index;
469 	struct rc_dst_register frac_dst, add_dst;
470 	struct rc_src_register frac_src, add_src;
471 
472 	/* add = src + .5 */
473 	add_index = rc_find_free_temporary(c);
474 	add_dst = dstregtmpmask(add_index, mask);
475 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, add_dst, inst->U.I.SrcReg[0],
476 								builtin_half);
477 	add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
478 
479 
480 	/* frac = FRC(add) */
481 	frac_index = rc_find_free_temporary(c);
482 	frac_dst = dstregtmpmask(frac_index, mask);
483 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, frac_dst, add_src);
484 	frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
485 
486 	/* dst = add - frac */
487 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, inst->U.I.DstReg,
488 						add_src, negate(frac_src));
489 	rc_remove_instruction(inst);
490 }
491 
transform_RSQ(struct radeon_compiler * c,struct rc_instruction * inst)492 static void transform_RSQ(struct radeon_compiler* c,
493 	struct rc_instruction* inst)
494 {
495 	inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
496 }
497 
transform_SEQ(struct radeon_compiler * c,struct rc_instruction * inst)498 static void transform_SEQ(struct radeon_compiler* c,
499 	struct rc_instruction* inst)
500 {
501 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
502 
503 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
504 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
505 		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
506 
507 	rc_remove_instruction(inst);
508 }
509 
transform_SFL(struct radeon_compiler * c,struct rc_instruction * inst)510 static void transform_SFL(struct radeon_compiler* c,
511 	struct rc_instruction* inst)
512 {
513 	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero);
514 	rc_remove_instruction(inst);
515 }
516 
transform_SGE(struct radeon_compiler * c,struct rc_instruction * inst)517 static void transform_SGE(struct radeon_compiler* c,
518 	struct rc_instruction* inst)
519 {
520 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
521 
522 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
523 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
524 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
525 
526 	rc_remove_instruction(inst);
527 }
528 
transform_SGT(struct radeon_compiler * c,struct rc_instruction * inst)529 static void transform_SGT(struct radeon_compiler* c,
530 	struct rc_instruction* inst)
531 {
532 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
533 
534 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
535 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
536 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
537 
538 	rc_remove_instruction(inst);
539 }
540 
transform_SLE(struct radeon_compiler * c,struct rc_instruction * inst)541 static void transform_SLE(struct radeon_compiler* c,
542 	struct rc_instruction* inst)
543 {
544 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
545 
546 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
547 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
548 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
549 
550 	rc_remove_instruction(inst);
551 }
552 
transform_SLT(struct radeon_compiler * c,struct rc_instruction * inst)553 static void transform_SLT(struct radeon_compiler* c,
554 	struct rc_instruction* inst)
555 {
556 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
557 
558 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
559 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
560 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
561 
562 	rc_remove_instruction(inst);
563 }
564 
transform_SNE(struct radeon_compiler * c,struct rc_instruction * inst)565 static void transform_SNE(struct radeon_compiler* c,
566 	struct rc_instruction* inst)
567 {
568 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
569 
570 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
571 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
572 		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
573 
574 	rc_remove_instruction(inst);
575 }
576 
transform_SSG(struct radeon_compiler * c,struct rc_instruction * inst)577 static void transform_SSG(struct radeon_compiler* c,
578 	struct rc_instruction* inst)
579 {
580 	/* result = sign(x)
581 	 *
582 	 *   CMP tmp0, -x, 1, 0
583 	 *   CMP tmp1, x, 1, 0
584 	 *   ADD result, tmp0, -tmp1;
585 	 */
586 	struct rc_dst_register dst0;
587 	unsigned tmp1;
588 
589 	/* 0 < x */
590 	dst0 = try_to_reuse_dst(c, inst);
591 	emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
592 	      dst0,
593 	      negate(inst->U.I.SrcReg[0]),
594 	      builtin_one,
595 	      builtin_zero);
596 
597 	/* x < 0 */
598 	tmp1 = rc_find_free_temporary(c);
599 	emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
600 	      dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
601 	      inst->U.I.SrcReg[0],
602 	      builtin_one,
603 	      builtin_zero);
604 
605 	/* Either both are zero, or one of them is one and the other is zero. */
606 	/* result = tmp0 - tmp1 */
607 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
608 	      inst->U.I.DstReg,
609 	      srcreg(RC_FILE_TEMPORARY, dst0.Index),
610 	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
611 
612 	rc_remove_instruction(inst);
613 }
614 
transform_SUB(struct radeon_compiler * c,struct rc_instruction * inst)615 static void transform_SUB(struct radeon_compiler* c,
616 	struct rc_instruction* inst)
617 {
618 	inst->U.I.Opcode = RC_OPCODE_ADD;
619 	inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);
620 }
621 
transform_SWZ(struct radeon_compiler * c,struct rc_instruction * inst)622 static void transform_SWZ(struct radeon_compiler* c,
623 	struct rc_instruction* inst)
624 {
625 	inst->U.I.Opcode = RC_OPCODE_MOV;
626 }
627 
transform_XPD(struct radeon_compiler * c,struct rc_instruction * inst)628 static void transform_XPD(struct radeon_compiler* c,
629 	struct rc_instruction* inst)
630 {
631 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
632 
633 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
634 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
635 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
636 	emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg,
637 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
638 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
639 		negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
640 
641 	rc_remove_instruction(inst);
642 }
643 
644 
645 /**
646  * Can be used as a transformation for @ref radeonClauseLocalTransform,
647  * no userData necessary.
648  *
649  * Eliminates the following ALU instructions:
650  *  ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
651  * using:
652  *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
653  *
654  * Transforms RSQ to Radeon's native RSQ by explicitly setting
655  * absolute value.
656  *
657  * @note should be applicable to R300 and R500 fragment programs.
658  */
radeonTransformALU(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)659 int radeonTransformALU(
660 	struct radeon_compiler * c,
661 	struct rc_instruction* inst,
662 	void* unused)
663 {
664 	switch(inst->U.I.Opcode) {
665 	case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
666 	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
667 	case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
668 	case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
669 	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
670 	case RC_OPCODE_DST: transform_DST(c, inst); return 1;
671 	case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
672 	case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
673 	case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
674 	case RC_OPCODE_POW: transform_POW(c, inst); return 1;
675 	case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;
676 	case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
677 	case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
678 	case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
679 	case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
680 	case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;
681 	case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
682 	case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
683 	case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
684 	case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
685 	case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
686 	case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
687 	case RC_OPCODE_TRUNC: transform_TRUNC(c, inst); return 1;
688 	case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
689 	default:
690 		return 0;
691 	}
692 }
693 
694 
transform_r300_vertex_ABS(struct radeon_compiler * c,struct rc_instruction * inst)695 static void transform_r300_vertex_ABS(struct radeon_compiler* c,
696 	struct rc_instruction* inst)
697 {
698 	/* Note: r500 can take absolute values, but r300 cannot. */
699 	inst->U.I.Opcode = RC_OPCODE_MAX;
700 	inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];
701 	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
702 }
703 
transform_r300_vertex_CMP(struct radeon_compiler * c,struct rc_instruction * inst)704 static void transform_r300_vertex_CMP(struct radeon_compiler* c,
705 	struct rc_instruction* inst)
706 {
707 	/* There is no decent CMP available, so let's rig one up.
708 	 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
709 	 * The following sequence consumes zero to two temps and two extra slots
710 	 * (the second temp and the second slot is consumed by transform_LRP),
711 	 * but should be equivalent:
712 	 *
713 	 * SLT tmp0, src0, 0.0
714 	 * LRP dst, tmp0, src1, src2
715 	 *
716 	 * Yes, I know, I'm a mad scientist. ~ C. & M. */
717 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
718 
719 	/* SLT tmp0, src0, 0.0 */
720 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
721 		dst,
722 		inst->U.I.SrcReg[0], builtin_zero);
723 
724 	/* LRP dst, tmp0, src1, src2 */
725 	transform_LRP(c,
726 		emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
727 		      inst->U.I.DstReg,
728 		      srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1],  inst->U.I.SrcReg[2]));
729 
730 	rc_remove_instruction(inst);
731 }
732 
transform_r300_vertex_DP2(struct radeon_compiler * c,struct rc_instruction * inst)733 static void transform_r300_vertex_DP2(struct radeon_compiler* c,
734 	struct rc_instruction* inst)
735 {
736 	struct rc_instruction *next_inst = inst->Next;
737 	transform_DP2(c, inst);
738 	next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
739 }
740 
transform_r300_vertex_DP3(struct radeon_compiler * c,struct rc_instruction * inst)741 static void transform_r300_vertex_DP3(struct radeon_compiler* c,
742 	struct rc_instruction* inst)
743 {
744 	struct rc_src_register src0 = inst->U.I.SrcReg[0];
745 	struct rc_src_register src1 = inst->U.I.SrcReg[1];
746 	src0.Negate &= ~RC_MASK_W;
747 	src0.Swizzle &= ~(7 << (3 * 3));
748 	src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
749 	src1.Negate &= ~RC_MASK_W;
750 	src1.Swizzle &= ~(7 << (3 * 3));
751 	src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
752 	emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);
753 	rc_remove_instruction(inst);
754 }
755 
transform_r300_vertex_fix_LIT(struct radeon_compiler * c,struct rc_instruction * inst)756 static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
757 	struct rc_instruction* inst)
758 {
759 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
760 	unsigned constant_swizzle;
761 	int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
762 							 0.0000000000000000001,
763 							 &constant_swizzle);
764 
765 	/* MOV dst, src */
766 	dst.WriteMask = RC_MASK_XYZW;
767 	emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
768 		dst,
769 		inst->U.I.SrcReg[0]);
770 
771 	/* MAX dst.y, src, 0.00...001 */
772 	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
773 		dstregtmpmask(dst.Index, RC_MASK_Y),
774 		srcreg(RC_FILE_TEMPORARY, dst.Index),
775 		srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
776 
777 	inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
778 }
779 
transform_r300_vertex_SEQ(struct radeon_compiler * c,struct rc_instruction * inst)780 static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
781 	struct rc_instruction *inst)
782 {
783 	/* x = y  <==>  x >= y && y >= x */
784 	int tmp = rc_find_free_temporary(c);
785 
786 	/* x <= y */
787 	emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
788 	      dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
789 	      inst->U.I.SrcReg[0],
790 	      inst->U.I.SrcReg[1]);
791 
792 	/* y <= x */
793 	emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
794 	      inst->U.I.DstReg,
795 	      inst->U.I.SrcReg[1],
796 	      inst->U.I.SrcReg[0]);
797 
798 	/* x && y  =  x * y */
799 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
800 	      inst->U.I.DstReg,
801 	      srcreg(RC_FILE_TEMPORARY, tmp),
802 	      srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
803 
804 	rc_remove_instruction(inst);
805 }
806 
transform_r300_vertex_SNE(struct radeon_compiler * c,struct rc_instruction * inst)807 static void transform_r300_vertex_SNE(struct radeon_compiler *c,
808 	struct rc_instruction *inst)
809 {
810 	/* x != y  <==>  x < y || y < x */
811 	int tmp = rc_find_free_temporary(c);
812 
813 	/* x < y */
814 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
815 	      dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
816 	      inst->U.I.SrcReg[0],
817 	      inst->U.I.SrcReg[1]);
818 
819 	/* y < x */
820 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
821 	      inst->U.I.DstReg,
822 	      inst->U.I.SrcReg[1],
823 	      inst->U.I.SrcReg[0]);
824 
825 	/* x || y  =  max(x, y) */
826 	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
827 	      inst->U.I.DstReg,
828 	      srcreg(RC_FILE_TEMPORARY, tmp),
829 	      srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
830 
831 	rc_remove_instruction(inst);
832 }
833 
transform_r300_vertex_SGT(struct radeon_compiler * c,struct rc_instruction * inst)834 static void transform_r300_vertex_SGT(struct radeon_compiler* c,
835 	struct rc_instruction* inst)
836 {
837 	/* x > y  <==>  -x < -y */
838 	inst->U.I.Opcode = RC_OPCODE_SLT;
839 	inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
840 	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
841 }
842 
transform_r300_vertex_SLE(struct radeon_compiler * c,struct rc_instruction * inst)843 static void transform_r300_vertex_SLE(struct radeon_compiler* c,
844 	struct rc_instruction* inst)
845 {
846 	/* x <= y  <==>  -x >= -y */
847 	inst->U.I.Opcode = RC_OPCODE_SGE;
848 	inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
849 	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
850 }
851 
transform_r300_vertex_SSG(struct radeon_compiler * c,struct rc_instruction * inst)852 static void transform_r300_vertex_SSG(struct radeon_compiler* c,
853 	struct rc_instruction* inst)
854 {
855 	/* result = sign(x)
856 	 *
857 	 *   SLT tmp0, 0, x;
858 	 *   SLT tmp1, x, 0;
859 	 *   ADD result, tmp0, -tmp1;
860 	 */
861 	struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
862 	unsigned tmp1;
863 
864 	/* 0 < x */
865 	dst0 = try_to_reuse_dst(c, inst);
866 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
867 	      dst0,
868 	      builtin_zero,
869 	      inst->U.I.SrcReg[0]);
870 
871 	/* x < 0 */
872 	tmp1 = rc_find_free_temporary(c);
873 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
874 	      dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
875 	      inst->U.I.SrcReg[0],
876 	      builtin_zero);
877 
878 	/* Either both are zero, or one of them is one and the other is zero. */
879 	/* result = tmp0 - tmp1 */
880 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
881 	      inst->U.I.DstReg,
882 	      srcreg(RC_FILE_TEMPORARY, dst0.Index),
883 	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
884 
885 	rc_remove_instruction(inst);
886 }
887 
transform_vertex_TRUNC(struct radeon_compiler * c,struct rc_instruction * inst)888 static void transform_vertex_TRUNC(struct radeon_compiler* c,
889 	struct rc_instruction* inst)
890 {
891 	struct rc_instruction *next = inst->Next;
892 
893 	/* next->Prev is removed after each transformation and replaced
894 	 * by a new instruction. */
895 	transform_TRUNC(c, next->Prev);
896 	transform_r300_vertex_CMP(c, next->Prev);
897 }
898 
899 /**
900  * For use with rc_local_transform, this transforms non-native ALU
901  * instructions of the r300 up to r500 vertex engine.
902  */
r300_transform_vertex_alu(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)903 int r300_transform_vertex_alu(
904 	struct radeon_compiler * c,
905 	struct rc_instruction* inst,
906 	void* unused)
907 {
908 	switch(inst->U.I.Opcode) {
909 	case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
910 	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
911 	case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
912 	case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
913 	case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
914 	case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
915 	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
916 	case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
917 	case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
918 	case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
919 	case RC_OPCODE_SEQ:
920 		if (!c->is_r500) {
921 			transform_r300_vertex_SEQ(c, inst);
922 			return 1;
923 		}
924 		return 0;
925 	case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
926 	case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1;
927 	case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1;
928 	case RC_OPCODE_SNE:
929 		if (!c->is_r500) {
930 			transform_r300_vertex_SNE(c, inst);
931 			return 1;
932 		}
933 		return 0;
934 	case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
935 	case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
936 	case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
937 	case RC_OPCODE_TRUNC: transform_vertex_TRUNC(c, inst); return 1;
938 	case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
939 	default:
940 		return 0;
941 	}
942 }
943 
sincos_constants(struct radeon_compiler * c,unsigned int * constants)944 static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
945 {
946 	static const float SinCosConsts[2][4] = {
947 		{
948 			1.273239545,		/* 4/PI */
949 			-0.405284735,		/* -4/(PI*PI) */
950 			3.141592654,		/* PI */
951 			0.2225			/* weight */
952 		},
953 		{
954 			0.75,
955 			0.5,
956 			0.159154943,		/* 1/(2*PI) */
957 			6.283185307		/* 2*PI */
958 		}
959 	};
960 	int i;
961 
962 	for(i = 0; i < 2; ++i)
963 		constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
964 }
965 
966 /**
967  * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
968  *
969  * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
970  * MAD tmp.x, tmp.y, |src|, tmp.x
971  * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
972  * MAD dest, tmp.y, weight, tmp.x
973  */
sin_approx(struct radeon_compiler * c,struct rc_instruction * inst,struct rc_dst_register dst,struct rc_src_register src,const unsigned int * constants)974 static void sin_approx(
975 	struct radeon_compiler* c, struct rc_instruction * inst,
976 	struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
977 {
978 	unsigned int tempreg = rc_find_free_temporary(c);
979 
980 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
981 		swizzle_xxxx(src),
982 		srcreg(RC_FILE_CONSTANT, constants[0]));
983 	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
984 		swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
985 		absolute(swizzle_xxxx(src)),
986 		swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
987 	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
988 		swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
989 		absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
990 		negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
991 	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
992 		swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
993 		swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
994 		swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
995 }
996 
997 /**
998  * Translate the trigonometric functions COS, SIN, and SCS
999  * using only the basic instructions
1000  *  MOV, ADD, MUL, MAD, FRC
1001  */
r300_transform_trig_simple(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)1002 int r300_transform_trig_simple(struct radeon_compiler* c,
1003 	struct rc_instruction* inst,
1004 	void* unused)
1005 {
1006 	unsigned int constants[2];
1007 	unsigned int tempreg;
1008 
1009 	if (inst->U.I.Opcode != RC_OPCODE_COS &&
1010 	    inst->U.I.Opcode != RC_OPCODE_SIN &&
1011 	    inst->U.I.Opcode != RC_OPCODE_SCS)
1012 		return 0;
1013 
1014 	tempreg = rc_find_free_temporary(c);
1015 
1016 	sincos_constants(c, constants);
1017 
1018 	if (inst->U.I.Opcode == RC_OPCODE_COS) {
1019 		/* MAD tmp.x, src, 1/(2*PI), 0.75 */
1020 		/* FRC tmp.x, tmp.x */
1021 		/* MAD tmp.z, tmp.x, 2*PI, -PI */
1022 		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1023 			swizzle_xxxx(inst->U.I.SrcReg[0]),
1024 			swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1025 			swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
1026 		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
1027 			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
1028 		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1029 			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1030 			swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1031 			negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1032 
1033 		sin_approx(c, inst, inst->U.I.DstReg,
1034 			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1035 			constants);
1036 	} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1037 		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1038 			swizzle_xxxx(inst->U.I.SrcReg[0]),
1039 			swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1040 			swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
1041 		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
1042 			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
1043 		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1044 			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1045 			swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1046 			negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1047 
1048 		sin_approx(c, inst, inst->U.I.DstReg,
1049 			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1050 			constants);
1051 	} else {
1052 		struct rc_dst_register dst;
1053 
1054 		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1055 			swizzle_xxxx(inst->U.I.SrcReg[0]),
1056 			swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1057 			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
1058 		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1059 			srcreg(RC_FILE_TEMPORARY, tempreg));
1060 		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1061 			srcreg(RC_FILE_TEMPORARY, tempreg),
1062 			swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1063 			negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1064 
1065 		dst = inst->U.I.DstReg;
1066 
1067 		dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
1068 		sin_approx(c, inst, dst,
1069 			swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
1070 			constants);
1071 
1072 		dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
1073 		sin_approx(c, inst, dst,
1074 			swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
1075 			constants);
1076 	}
1077 
1078 	rc_remove_instruction(inst);
1079 
1080 	return 1;
1081 }
1082 
r300_transform_SIN_COS_SCS(struct radeon_compiler * c,struct rc_instruction * inst,unsigned srctmp)1083 static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
1084 	struct rc_instruction *inst,
1085 	unsigned srctmp)
1086 {
1087 	if (inst->U.I.Opcode == RC_OPCODE_COS) {
1088 		emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
1089 			srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1090 	} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1091 		emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
1092 			inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1093 	} else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
1094 		struct rc_dst_register moddst = inst->U.I.DstReg;
1095 
1096 		if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
1097 			moddst.WriteMask = RC_MASK_X;
1098 			emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst,
1099 				srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1100 		}
1101 		if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
1102 			moddst.WriteMask = RC_MASK_Y;
1103 			emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst,
1104 				srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1105 		}
1106 	}
1107 
1108 	rc_remove_instruction(inst);
1109 }
1110 
1111 
1112 /**
1113  * Transform the trigonometric functions COS, SIN, and SCS
1114  * to include pre-scaling by 1/(2*PI) and taking the fractional
1115  * part, so that the input to COS and SIN is always in the range [0,1).
1116  * SCS is replaced by one COS and one SIN instruction.
1117  *
1118  * @warning This transformation implicitly changes the semantics of SIN and COS!
1119  */
radeonTransformTrigScale(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)1120 int radeonTransformTrigScale(struct radeon_compiler* c,
1121 	struct rc_instruction* inst,
1122 	void* unused)
1123 {
1124 	static const float RCP_2PI = 0.15915494309189535;
1125 	unsigned int temp;
1126 	unsigned int constant;
1127 	unsigned int constant_swizzle;
1128 
1129 	if (inst->U.I.Opcode != RC_OPCODE_COS &&
1130 	    inst->U.I.Opcode != RC_OPCODE_SIN &&
1131 	    inst->U.I.Opcode != RC_OPCODE_SCS)
1132 		return 0;
1133 
1134 	temp = rc_find_free_temporary(c);
1135 	constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
1136 
1137 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),
1138 		swizzle_xxxx(inst->U.I.SrcReg[0]),
1139 		srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
1140 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1141 		srcreg(RC_FILE_TEMPORARY, temp));
1142 
1143 	r300_transform_SIN_COS_SCS(c, inst, temp);
1144 	return 1;
1145 }
1146 
1147 /**
1148  * Transform the trigonometric functions COS, SIN, and SCS
1149  * so that the input to COS and SIN is always in the range [-PI, PI].
1150  * SCS is replaced by one COS and one SIN instruction.
1151  */
r300_transform_trig_scale_vertex(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)1152 int r300_transform_trig_scale_vertex(struct radeon_compiler *c,
1153 	struct rc_instruction *inst,
1154 	void *unused)
1155 {
1156 	static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
1157 	unsigned int temp;
1158 	unsigned int constant;
1159 
1160 	if (inst->U.I.Opcode != RC_OPCODE_COS &&
1161 	    inst->U.I.Opcode != RC_OPCODE_SIN &&
1162 	    inst->U.I.Opcode != RC_OPCODE_SCS)
1163 		return 0;
1164 
1165 	/* Repeat x in the range [-PI, PI]:
1166 	 *
1167 	 *   repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
1168 	 */
1169 
1170 	temp = rc_find_free_temporary(c);
1171 	constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons);
1172 
1173 	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1174 		swizzle_xxxx(inst->U.I.SrcReg[0]),
1175 		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX),
1176 		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY));
1177 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1178 		srcreg(RC_FILE_TEMPORARY, temp));
1179 	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1180 		srcreg(RC_FILE_TEMPORARY, temp),
1181 		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ),
1182 		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW));
1183 
1184 	r300_transform_SIN_COS_SCS(c, inst, temp);
1185 	return 1;
1186 }
1187 
1188 /**
1189  * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
1190  * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
1191  * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
1192  *
1193  * @warning This explicitly changes the form of DDX and DDY!
1194  */
1195 
radeonTransformDeriv(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)1196 int radeonTransformDeriv(struct radeon_compiler* c,
1197 	struct rc_instruction* inst,
1198 	void* unused)
1199 {
1200 	if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
1201 		return 0;
1202 
1203 	inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;
1204 	inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
1205 
1206 	return 1;
1207 }
1208 
1209 /**
1210  * IF Temp[0].x -> IF Temp[0].x
1211  * ...          -> ...
1212  * KILP         -> KIL -abs(Temp[0].x)
1213  * ...          -> ...
1214  * ENDIF        -> ENDIF
1215  *
1216  * === OR ===
1217  *
1218  * IF Temp[0].x -\
1219  * KILP         - > KIL -abs(Temp[0].x)
1220  * ENDIF        -/
1221  *
1222  * === OR ===
1223  *
1224  * IF Temp[0].x -> IF Temp[0].x
1225  * ...          -> ...
1226  * ELSE         -> ELSE
1227  * ...	        -> ...
1228  * KILP	        -> KIL -abs(Temp[0].x)
1229  * ...          -> ...
1230  * ENDIF        -> ENDIF
1231  *
1232  * === OR ===
1233  *
1234  * KILP         -> KIL -none.1111
1235  *
1236  * This needs to be done in its own pass, because it might modify the
1237  * instructions before and after KILP.
1238  */
rc_transform_KILP(struct radeon_compiler * c,void * user)1239 void rc_transform_KILP(struct radeon_compiler * c, void *user)
1240 {
1241 	struct rc_instruction * inst;
1242 	for (inst = c->Program.Instructions.Next;
1243 			inst != &c->Program.Instructions; inst = inst->Next) {
1244 		struct rc_instruction * if_inst;
1245 		unsigned in_if = 0;
1246 
1247 		if (inst->U.I.Opcode != RC_OPCODE_KILP)
1248 			continue;
1249 
1250 		for (if_inst = inst->Prev; if_inst != &c->Program.Instructions;
1251 						if_inst = if_inst->Prev) {
1252 
1253 			if (if_inst->U.I.Opcode == RC_OPCODE_IF) {
1254 				in_if = 1;
1255 				break;
1256 			}
1257 		}
1258 
1259 		inst->U.I.Opcode = RC_OPCODE_KIL;
1260 
1261 		if (!in_if) {
1262 			inst->U.I.SrcReg[0] = negate(builtin_one);
1263 		} else {
1264 			/* This should work even if the KILP is inside the ELSE
1265 			 * block, because -0.0 is considered negative. */
1266 			inst->U.I.SrcReg[0] =
1267 				negate(absolute(if_inst->U.I.SrcReg[0]));
1268 
1269 			if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
1270 				&& inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
1271 
1272 				/* Optimize the special case:
1273 				 * IF Temp[0].x
1274 				 * KILP
1275 				 * ENDIF
1276 				 */
1277 
1278 				/* Remove IF */
1279 				rc_remove_instruction(inst->Prev);
1280 				/* Remove ENDIF */
1281 				rc_remove_instruction(inst->Next);
1282 			}
1283 		}
1284 	}
1285 }
1286