1 /*
2  * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
22  */
23 #include "r600_sq.h"
24 #include "r600_formats.h"
25 #include "r600_opcodes.h"
26 #include "r600_shader.h"
27 #include "r600d.h"
28 
29 #include "sb/sb_public.h"
30 
31 #include "pipe/p_shader_tokens.h"
32 #include "tgsi/tgsi_info.h"
33 #include "tgsi/tgsi_parse.h"
34 #include "tgsi/tgsi_scan.h"
35 #include "tgsi/tgsi_dump.h"
36 #include "util/u_bitcast.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41 
42 /* CAYMAN notes
43 Why CAYMAN got loops for lots of instructions is explained here.
44 
45 -These 8xx t-slot only ops are implemented in all vector slots.
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47 These 8xx t-slot only opcodes become vector ops, with all four
48 slots expecting the arguments on sources a and b. Result is
49 broadcast to all channels.
50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51 These 8xx t-slot only opcodes become vector ops in the z, y, and
52 x slots.
53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55 SQRT_IEEE/_64
56 SIN/COS
57 The w slot may have an independent co-issued operation, or if the
58 result is required to be in the w slot, the opcode above may be
59 issued in the w slot as well.
60 The compiler must issue the source argument to slots z, y, and x
61 */
62 
63 /* Contents of r0 on entry to various shaders
64 
65  VS - .x = VertexID
66       .y = RelVertexID (??)
67       .w = InstanceID
68 
69  GS - r0.xyw, r1.xyz = per-vertex offsets
70       r0.z = PrimitiveID
71 
72  TCS - .x = PatchID
73        .y = RelPatchID (??)
74        .z = InvocationID
75        .w = tess factor base.
76 
77  TES - .x = TessCoord.x
78      - .y = TessCoord.y
79      - .z = RelPatchID (??)
80      - .w = PrimitiveID
81 
82  PS - face_gpr.z = SampleMask
83       face_gpr.w = SampleID
84 */
/* Selector value derived from the buffer-info constant offset: 512 plus
 * R600_BUFFER_INFO_OFFSET expressed in 16-byte units.
 * NOTE(review): 512 is presumably the base selector of the constant file --
 * confirm against the hardware headers.
 */
#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
/* Forward declaration; used by r600_pipe_shader_create() to translate the
 * shader's TGSI into bytecode before it is built and stored. */
static int r600_shader_from_tgsi(struct r600_context *rctx,
				 struct r600_pipe_shader *pipeshader,
				 union r600_shader_key key);
89 
r600_add_gpr_array(struct r600_shader * ps,int start_gpr,int size,unsigned comp_mask)90 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
91                            int size, unsigned comp_mask) {
92 
93 	if (!size)
94 		return;
95 
96 	if (ps->num_arrays == ps->max_arrays) {
97 		ps->max_arrays += 64;
98 		ps->arrays = realloc(ps->arrays, ps->max_arrays *
99 		                     sizeof(struct r600_shader_array));
100 	}
101 
102 	int n = ps->num_arrays;
103 	++ps->num_arrays;
104 
105 	ps->arrays[n].comp_mask = comp_mask;
106 	ps->arrays[n].gpr_start = start_gpr;
107 	ps->arrays[n].gpr_count = size;
108 }
109 
r600_dump_streamout(struct pipe_stream_output_info * so)110 static void r600_dump_streamout(struct pipe_stream_output_info *so)
111 {
112 	unsigned i;
113 
114 	fprintf(stderr, "STREAMOUT\n");
115 	for (i = 0; i < so->num_outputs; i++) {
116 		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
117 				so->output[i].start_component;
118 		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
119 			i,
120 			so->output[i].stream,
121 			so->output[i].output_buffer,
122 			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
123 			so->output[i].register_index,
124 			mask & 1 ? "x" : "",
125 		        mask & 2 ? "y" : "",
126 		        mask & 4 ? "z" : "",
127 		        mask & 8 ? "w" : "",
128 			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
129 	}
130 }
131 
store_shader(struct pipe_context * ctx,struct r600_pipe_shader * shader)132 static int store_shader(struct pipe_context *ctx,
133 			struct r600_pipe_shader *shader)
134 {
135 	struct r600_context *rctx = (struct r600_context *)ctx;
136 	uint32_t *ptr, i;
137 
138 	if (shader->bo == NULL) {
139 		shader->bo = (struct r600_resource*)
140 			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
141 		if (shader->bo == NULL) {
142 			return -ENOMEM;
143 		}
144 		ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
145 		if (R600_BIG_ENDIAN) {
146 			for (i = 0; i < shader->shader.bc.ndw; ++i) {
147 				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
148 			}
149 		} else {
150 			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
151 		}
152 		rctx->b.ws->buffer_unmap(shader->bo->buf);
153 	}
154 
155 	return 0;
156 }
157 
/*
 * Compile one shader variant and prepare it for use.
 *
 * Steps, in order:
 *  1. optionally dump the TGSI (and stream-output layout) to stderr;
 *  2. translate the TGSI with r600_shader_from_tgsi();
 *  3. build the bytecode unless it already exists;
 *  4. either disassemble it, or hand it to r600_sb_bytecode_process();
 *  5. upload the bytecode (and that of the GS copy shader, if any) with
 *     store_shader();
 *  6. build the hardware state matching the shader's processor type.
 *
 * @ctx     pipe context (also used as struct r600_context)
 * @shader  shader variant to fill in; shader->selector supplies the tokens
 * @key     variant key; vs.as_ls, vs.as_es and tes.as_es select the stage
 *
 * Returns 0 on success. On any failure the shader is torn down with
 * r600_pipe_shader_destroy() and the failing step's error code (or -EINVAL
 * for an unknown processor type) is returned.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b,
					 tgsi_get_processor_type(sel->tokens));
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	/* Note: computed from the initial use_sb, before the per-shader
	 * restrictions below may clear it. */
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}
	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	/* sb is never used for tessellation or compute shaders. */
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	/* ... and for shaders using atomics or images */
	use_sb &= !shader->shader.uses_atomics;
	use_sb &= !shader->shader.uses_images;

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	/* Plain disassembly when dumping without sb; otherwise run sb, which
	 * receives both the dump flag and whether it should optimize. */
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
		                             dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		/* The GS copy shader gets vertex-shader state. */
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_COMPUTE:
		/* Compute shaders use the LS state update. */
		evergreen_update_ls_state(ctx, shader);
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
296 
/*
 * Release what r600_pipe_shader_create() attached to a shader variant:
 * drops the reference on the bytecode buffer (shader->bo), clears the
 * bytecode and releases the command buffer. @ctx is unused.
 */
void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
303 
304 /*
305  * tgsi -> r600 shader
306  */
307 struct r600_shader_tgsi_instruction;
308 
/*
 * Description of one source operand of the instruction being translated;
 * r600_shader_ctx keeps up to four of these in src[].
 * NOTE(review): r600_bytecode_src() presumably turns one channel of this
 * into a bytecode ALU source -- confirm at its definition.
 */
struct r600_shader_src {
	unsigned				sel;
	unsigned				swizzle[4];
	unsigned				neg;
	unsigned				abs;
	unsigned				rel;
	unsigned				kc_bank;
	boolean					kc_rel; /* true if cache bank is indexed */
	uint32_t				value[4];
};
319 
/*
 * State of one evergreen interpolator slot (see
 * r600_shader_ctx::eg_interpolators). ij_index is the value copied into an
 * input's ij_index by evergreen_interp_assign_ij_index().
 */
struct eg_interp {
	boolean					enabled;
	unsigned				ij_index;
};
324 
/*
 * Working state for translating one TGSI shader into r600 bytecode:
 * the TGSI parser, per-file register offsets, the bytecode being built
 * and the registers reserved for stage-specific system values.
 */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;
	unsigned				file_offset[TGSI_FILE_COUNT];
	/* first driver temporary; r600_get_temp() hands out temp_reg + n */
	unsigned				temp_reg;
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];
	uint32_t				*literals;
	uint32_t				nliterals;
	/* number of driver temporaries handed out by r600_get_temp() */
	uint32_t				max_driver_temp_used;
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean                 clip_vertex_write;
	unsigned                cv_output;
	unsigned		edgeflag_output;
	int                                     cs_block_size_reg;
	int                                     cs_grid_size_reg;
	bool cs_block_size_loaded, cs_grid_size_loaded;
	int					fragcoord_input;
	int					next_ring_offset;
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	int                                     gs_rotated_input[2];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned                                tess_input_info; /* temp with tess input offsets */
	unsigned                                tess_output_info; /* temp with tess output offsets */
	unsigned                                thread_id_gpr; /* temp with thread id calculated for images */
	bool thread_id_gpr_loaded;
};
366 
/*
 * One entry of a TGSI opcode table: the opcode value a handler reads through
 * ctx->inst_info->op (e.g. tgsi_barrier() uses it as the ALU opcode) and the
 * handler itself, which returns 0 or an error code.
 */
struct r600_shader_tgsi_instruction {
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};
371 
/* Forward declarations of helpers and of the per-chip opcode tables
 * (r600, evergreen, cayman) that are defined further down in this file. */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
static int tgsi_else(struct r600_shader_ctx *ctx);
static int tgsi_endif(struct r600_shader_ctx *ctx);
static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
static int tgsi_endloop(struct r600_shader_ctx *ctx);
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
                                unsigned int dst_reg);
static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
			const struct r600_shader_src *shader_src,
			unsigned chan);
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask);
390 
ctx_needs_stack_workaround_8xx(struct r600_shader_ctx * ctx)391 static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)
392 {
393 	if (ctx->bc->family == CHIP_HEMLOCK ||
394 	    ctx->bc->family == CHIP_CYPRESS ||
395 	    ctx->bc->family == CHIP_JUNIPER)
396 		return false;
397 	return true;
398 }
399 
/*
 * Return the index (0..3) of the highest channel enabled in the low four
 * bits of @writemask. An empty mask yields 0; bits above bit 3 are ignored.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* Scan downwards: the first hit is the highest enabled channel. */
	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1 << chan))
			return chan;
	}
	return 0;
}
411 
/*
 * Check whether the instruction currently held in ctx->parse can be
 * translated.
 *
 * Rejected with -EINVAL (after an R600_ERR message):
 *  - more than one destination register, unless the opcode is
 *    TGSI_OPCODE_DFRACEXP;
 *  - a source register with Dimension set, except for the CONSTANT and
 *    HW_ATOMIC files, INPUT in geometry / tess-ctrl / tess-eval shaders,
 *    and OUTPUT in tess-ctrl shaders;
 *  - a destination register with Dimension set outside tess-ctrl shaders.
 *
 * Returns 0 when the instruction is acceptable.
 */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	unsigned j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
		   switch (i->Src[j].Register.File) {
		   case TGSI_FILE_CONSTANT:
		   case TGSI_FILE_HW_ATOMIC:
			   break;
		   case TGSI_FILE_INPUT:
			   if (ctx->type == PIPE_SHADER_GEOMETRY ||
			       ctx->type == PIPE_SHADER_TESS_CTRL ||
			       ctx->type == PIPE_SHADER_TESS_EVAL)
				   break;
			   /* fallthrough: other shader types end up in the
			    * error path below */
		   case TGSI_FILE_OUTPUT:
			   if (ctx->type == PIPE_SHADER_TESS_CTRL)
				   break;
			   /* fallthrough */
		   default:
			   R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
				    i->Src[j].Register.File,
				    i->Src[j].Register.Dimension);
			   return -EINVAL;
		   }
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			if (ctx->type == PIPE_SHADER_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
459 
eg_get_interpolator_index(unsigned interpolate,unsigned location)460 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
461 {
462 	if (interpolate == TGSI_INTERPOLATE_COLOR ||
463 		interpolate == TGSI_INTERPOLATE_LINEAR ||
464 		interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
465 	{
466 		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
467 		int loc;
468 
469 		switch(location) {
470 		case TGSI_INTERPOLATE_LOC_CENTER:
471 			loc = 1;
472 			break;
473 		case TGSI_INTERPOLATE_LOC_CENTROID:
474 			loc = 2;
475 			break;
476 		case TGSI_INTERPOLATE_LOC_SAMPLE:
477 		default:
478 			loc = 0; break;
479 		}
480 
481 		return is_linear * 3 + loc;
482 	}
483 
484 	return -1;
485 }
486 
evergreen_interp_assign_ij_index(struct r600_shader_ctx * ctx,int input)487 static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
488 		int input)
489 {
490 	int i = eg_get_interpolator_index(
491 		ctx->shader->input[input].interpolate,
492 		ctx->shader->input[input].interpolate_location);
493 	assert(i >= 0);
494 	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
495 }
496 
evergreen_interp_alu(struct r600_shader_ctx * ctx,int input)497 static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
498 {
499 	int i, r;
500 	struct r600_bytecode_alu alu;
501 	int gpr = 0, base_chan = 0;
502 	int ij_index = ctx->shader->input[input].ij_index;
503 
504 	/* work out gpr and base_chan from index */
505 	gpr = ij_index / 2;
506 	base_chan = (2 * (ij_index % 2)) + 1;
507 
508 	for (i = 0; i < 8; i++) {
509 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
510 
511 		if (i < 4)
512 			alu.op = ALU_OP2_INTERP_ZW;
513 		else
514 			alu.op = ALU_OP2_INTERP_XY;
515 
516 		if ((i > 1) && (i < 6)) {
517 			alu.dst.sel = ctx->shader->input[input].gpr;
518 			alu.dst.write = 1;
519 		}
520 
521 		alu.dst.chan = i % 4;
522 
523 		alu.src[0].sel = gpr;
524 		alu.src[0].chan = (base_chan - (i % 2));
525 
526 		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
527 
528 		alu.bank_swizzle_force = SQ_ALU_VEC_210;
529 		if ((i % 4) == 3)
530 			alu.last = 1;
531 		r = r600_bytecode_add_alu(ctx->bc, &alu);
532 		if (r)
533 			return r;
534 	}
535 	return 0;
536 }
537 
evergreen_interp_flat(struct r600_shader_ctx * ctx,int input)538 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
539 {
540 	int i, r;
541 	struct r600_bytecode_alu alu;
542 
543 	for (i = 0; i < 4; i++) {
544 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
545 
546 		alu.op = ALU_OP1_INTERP_LOAD_P0;
547 
548 		alu.dst.sel = ctx->shader->input[input].gpr;
549 		alu.dst.write = 1;
550 
551 		alu.dst.chan = i;
552 
553 		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
554 		alu.src[0].chan = i;
555 
556 		if (i == 3)
557 			alu.last = 1;
558 		r = r600_bytecode_add_alu(ctx->bc, &alu);
559 		if (r)
560 			return r;
561 	}
562 	return 0;
563 }
564 
565 /*
566  * Special export handling in shaders
567  *
568  * shader export ARRAY_BASE for EXPORT_POS:
569  * 60 is position
570  * 61 is misc vector
571  * 62, 63 are clip distance vectors
572  *
573  * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
574  * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
575  * USE_VTX_POINT_SIZE - point size in the X channel of export 61
576  * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
577  * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
578  * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
579  * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
580  * exclusive from render target index)
581  * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
582  *
583  *
584  * shader export ARRAY_BASE for EXPORT_PIXEL:
585  * 0-7 CB targets
586  * 61 computed Z vector
587  *
588  * The use of the values exported in the computed Z vector are controlled
589  * by DB_SHADER_CONTROL:
590  * Z_EXPORT_ENABLE - Z as a float in RED
591  * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
592  * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
593  * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
594  * DB_SOURCE_FORMAT - export control restrictions
595  *
596  */
597 
598 
599 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
r600_spi_sid(struct r600_shader_io * io)600 static int r600_spi_sid(struct r600_shader_io * io)
601 {
602 	int index, name = io->name;
603 
604 	/* These params are handled differently, they don't need
605 	 * semantic indices, so we'll use 0 for them.
606 	 */
607 	if (name == TGSI_SEMANTIC_POSITION ||
608 	    name == TGSI_SEMANTIC_PSIZE ||
609 	    name == TGSI_SEMANTIC_EDGEFLAG ||
610 	    name == TGSI_SEMANTIC_FACE ||
611 	    name == TGSI_SEMANTIC_SAMPLEMASK)
612 		index = 0;
613 	else {
614 		if (name == TGSI_SEMANTIC_GENERIC) {
615 			/* For generic params simply use sid from tgsi */
616 			index = io->sid;
617 		} else {
618 			/* For non-generic params - pack name and sid into 8 bits */
619 			index = 0x80 | (name<<3) | (io->sid);
620 		}
621 
622 		/* Make sure that all really used indices have nonzero value, so
623 		 * we can just compare it to 0 later instead of comparing the name
624 		 * with different values to detect special cases. */
625 		index++;
626 	}
627 
628 	return index;
629 };
630 
631 /* we need this to get a common lds index for vs/tcs/tes input/outputs */
r600_get_lds_unique_index(unsigned semantic_name,unsigned index)632 int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
633 {
634 	switch (semantic_name) {
635 	case TGSI_SEMANTIC_POSITION:
636 		return 0;
637 	case TGSI_SEMANTIC_PSIZE:
638 		return 1;
639 	case TGSI_SEMANTIC_CLIPDIST:
640 		assert(index <= 1);
641 		return 2 + index;
642 	case TGSI_SEMANTIC_GENERIC:
643 		if (index <= 63-4)
644 			return 4 + index - 9;
645 		else
646 			/* same explanation as in the default statement,
647 			 * the only user hitting this is st/nine.
648 			 */
649 			return 0;
650 
651 	/* patch indices are completely separate and thus start from 0 */
652 	case TGSI_SEMANTIC_TESSOUTER:
653 		return 0;
654 	case TGSI_SEMANTIC_TESSINNER:
655 		return 1;
656 	case TGSI_SEMANTIC_PATCH:
657 		return 2 + index;
658 
659 	default:
660 		/* Don't fail here. The result of this function is only used
661 		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
662 		 * occur, but this function is called for all vertex shaders
663 		 * before it's known whether LS will be compiled or not.
664 		 */
665 		return 0;
666 	}
667 }
668 
669 /* turn input into interpolate on EG */
evergreen_interp_input(struct r600_shader_ctx * ctx,int index)670 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
671 {
672 	int r = 0;
673 
674 	if (ctx->shader->input[index].spi_sid) {
675 		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
676 		if (ctx->shader->input[index].interpolate > 0) {
677 			evergreen_interp_assign_ij_index(ctx, index);
678 			r = evergreen_interp_alu(ctx, index);
679 		} else {
680 			r = evergreen_interp_flat(ctx, index);
681 		}
682 	}
683 	return r;
684 }
685 
select_twoside_color(struct r600_shader_ctx * ctx,int front,int back)686 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
687 {
688 	struct r600_bytecode_alu alu;
689 	int i, r;
690 	int gpr_front = ctx->shader->input[front].gpr;
691 	int gpr_back = ctx->shader->input[back].gpr;
692 
693 	for (i = 0; i < 4; i++) {
694 		memset(&alu, 0, sizeof(alu));
695 		alu.op = ALU_OP3_CNDGT;
696 		alu.is_op3 = 1;
697 		alu.dst.write = 1;
698 		alu.dst.sel = gpr_front;
699 		alu.src[0].sel = ctx->face_gpr;
700 		alu.src[1].sel = gpr_front;
701 		alu.src[2].sel = gpr_back;
702 
703 		alu.dst.chan = i;
704 		alu.src[1].chan = i;
705 		alu.src[2].chan = i;
706 		alu.last = (i==3);
707 
708 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
709 			return r;
710 	}
711 
712 	return 0;
713 }
714 
715 /* execute a single slot ALU calculation */
single_alu_op2(struct r600_shader_ctx * ctx,int op,int dst_sel,int dst_chan,int src0_sel,unsigned src0_chan_val,int src1_sel,unsigned src1_chan_val)716 static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
717 			  int dst_sel, int dst_chan,
718 			  int src0_sel, unsigned src0_chan_val,
719 			  int src1_sel, unsigned src1_chan_val)
720 {
721 	struct r600_bytecode_alu alu;
722 	int r, i;
723 
724 	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
725 		for (i = 0; i < 4; i++) {
726 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
727 			alu.op = op;
728 			alu.src[0].sel = src0_sel;
729 			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
730 				alu.src[0].value = src0_chan_val;
731 			else
732 				alu.src[0].chan = src0_chan_val;
733 			alu.src[1].sel = src1_sel;
734 			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
735 				alu.src[1].value = src1_chan_val;
736 			else
737 				alu.src[1].chan = src1_chan_val;
738 			alu.dst.sel = dst_sel;
739 			alu.dst.chan = i;
740 			alu.dst.write = i == dst_chan;
741 			alu.last = (i == 3);
742 			r = r600_bytecode_add_alu(ctx->bc, &alu);
743 			if (r)
744 				return r;
745 		}
746 		return 0;
747 	}
748 
749 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
750 	alu.op = op;
751 	alu.src[0].sel = src0_sel;
752 	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
753 		alu.src[0].value = src0_chan_val;
754 	else
755 		alu.src[0].chan = src0_chan_val;
756 	alu.src[1].sel = src1_sel;
757 	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
758 		alu.src[1].value = src1_chan_val;
759 	else
760 		alu.src[1].chan = src1_chan_val;
761 	alu.dst.sel = dst_sel;
762 	alu.dst.chan = dst_chan;
763 	alu.dst.write = 1;
764 	alu.last = 1;
765 	r = r600_bytecode_add_alu(ctx->bc, &alu);
766 	if (r)
767 		return r;
768 	return 0;
769 }
770 
771 /* execute a single slot ALU calculation */
single_alu_op3(struct r600_shader_ctx * ctx,int op,int dst_sel,int dst_chan,int src0_sel,unsigned src0_chan_val,int src1_sel,unsigned src1_chan_val,int src2_sel,unsigned src2_chan_val)772 static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
773 			  int dst_sel, int dst_chan,
774 			  int src0_sel, unsigned src0_chan_val,
775 			  int src1_sel, unsigned src1_chan_val,
776 			  int src2_sel, unsigned src2_chan_val)
777 {
778 	struct r600_bytecode_alu alu;
779 	int r;
780 
781 	/* validate this for other ops */
782 	assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT);
783 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
784 	alu.op = op;
785 	alu.src[0].sel = src0_sel;
786 	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
787 		alu.src[0].value = src0_chan_val;
788 	else
789 		alu.src[0].chan = src0_chan_val;
790 	alu.src[1].sel = src1_sel;
791 	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
792 		alu.src[1].value = src1_chan_val;
793 	else
794 		alu.src[1].chan = src1_chan_val;
795 	alu.src[2].sel = src2_sel;
796 	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
797 		alu.src[2].value = src2_chan_val;
798 	else
799 		alu.src[2].chan = src2_chan_val;
800 	alu.dst.sel = dst_sel;
801 	alu.dst.chan = dst_chan;
802 	alu.is_op3 = 1;
803 	alu.last = 1;
804 	r = r600_bytecode_add_alu(ctx->bc, &alu);
805 	if (r)
806 		return r;
807 	return 0;
808 }
809 
810 /* put it in temp_reg.x */
get_lds_offset0(struct r600_shader_ctx * ctx,int rel_patch_chan,int temp_reg,bool is_patch_var)811 static int get_lds_offset0(struct r600_shader_ctx *ctx,
812 			   int rel_patch_chan,
813 			   int temp_reg, bool is_patch_var)
814 {
815 	int r;
816 
817 	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
818 	/* ADD
819 	   Dimension - patch0_offset (input_vals.z),
820 	   Non-dim - patch0_data_offset (input_vals.w)
821 	*/
822 	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
823 			   temp_reg, 0,
824 			   ctx->tess_output_info, 0,
825 			   0, rel_patch_chan,
826 			   ctx->tess_output_info, is_patch_var ? 3 : 2);
827 	if (r)
828 		return r;
829 	return 0;
830 }
831 
get_address_file_reg(struct r600_shader_ctx * ctx,int index)832 static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
833 {
834 	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
835 }
836 
r600_get_temp(struct r600_shader_ctx * ctx)837 static int r600_get_temp(struct r600_shader_ctx *ctx)
838 {
839 	return ctx->temp_reg + ctx->max_driver_temp_used++;
840 }
841 
vs_add_primid_output(struct r600_shader_ctx * ctx,int prim_id_sid)842 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
843 {
844 	int i;
845 	i = ctx->shader->noutput++;
846 	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
847 	ctx->shader->output[i].sid = 0;
848 	ctx->shader->output[i].gpr = 0;
849 	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
850 	ctx->shader->output[i].write_mask = 0x4;
851 	ctx->shader->output[i].spi_sid = prim_id_sid;
852 
853 	return 0;
854 }
855 
tgsi_barrier(struct r600_shader_ctx * ctx)856 static int tgsi_barrier(struct r600_shader_ctx *ctx)
857 {
858 	struct r600_bytecode_alu alu;
859 	int r;
860 
861 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
862 	alu.op = ctx->inst_info->op;
863 	alu.last = 1;
864 
865 	r = r600_bytecode_add_alu(ctx->bc, &alu);
866 	if (r)
867 		return r;
868 	return 0;
869 }
870 
tgsi_declaration(struct r600_shader_ctx * ctx)871 static int tgsi_declaration(struct r600_shader_ctx *ctx)
872 {
873 	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
874 	int r, i, j, count = d->Range.Last - d->Range.First + 1;
875 
876 	switch (d->Declaration.File) {
877 	case TGSI_FILE_INPUT:
878 		for (j = 0; j < count; j++) {
879 			i = ctx->shader->ninput + j;
880 			assert(i < ARRAY_SIZE(ctx->shader->input));
881 			ctx->shader->input[i].name = d->Semantic.Name;
882 			ctx->shader->input[i].sid = d->Semantic.Index + j;
883 			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
884 			ctx->shader->input[i].interpolate_location = d->Interp.Location;
885 			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
886 			if (ctx->type == PIPE_SHADER_FRAGMENT) {
887 				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
888 				switch (ctx->shader->input[i].name) {
889 				case TGSI_SEMANTIC_FACE:
890 					if (ctx->face_gpr != -1)
891 						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
892 					else
893 						ctx->face_gpr = ctx->shader->input[i].gpr;
894 					break;
895 				case TGSI_SEMANTIC_COLOR:
896 					ctx->colors_used++;
897 					break;
898 				case TGSI_SEMANTIC_POSITION:
899 					ctx->fragcoord_input = i;
900 					break;
901 				case TGSI_SEMANTIC_PRIMID:
902 					/* set this for now */
903 					ctx->shader->gs_prim_id_input = true;
904 					ctx->shader->ps_prim_id_input = i;
905 					break;
906 				}
907 				if (ctx->bc->chip_class >= EVERGREEN) {
908 					if ((r = evergreen_interp_input(ctx, i)))
909 						return r;
910 				}
911 			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
912 				/* FIXME probably skip inputs if they aren't passed in the ring */
913 				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
914 				ctx->next_ring_offset += 16;
915 				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
916 					ctx->shader->gs_prim_id_input = true;
917 			}
918 		}
919 		ctx->shader->ninput += count;
920 		break;
921 	case TGSI_FILE_OUTPUT:
922 		for (j = 0; j < count; j++) {
923 			i = ctx->shader->noutput + j;
924 			assert(i < ARRAY_SIZE(ctx->shader->output));
925 			ctx->shader->output[i].name = d->Semantic.Name;
926 			ctx->shader->output[i].sid = d->Semantic.Index + j;
927 			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
928 			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
929 			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
930 			if (ctx->type == PIPE_SHADER_VERTEX ||
931 			    ctx->type == PIPE_SHADER_GEOMETRY ||
932 			    ctx->type == PIPE_SHADER_TESS_EVAL) {
933 				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
934 				switch (d->Semantic.Name) {
935 				case TGSI_SEMANTIC_CLIPDIST:
936 					break;
937 				case TGSI_SEMANTIC_PSIZE:
938 					ctx->shader->vs_out_misc_write = 1;
939 					ctx->shader->vs_out_point_size = 1;
940 					break;
941 				case TGSI_SEMANTIC_EDGEFLAG:
942 					ctx->shader->vs_out_misc_write = 1;
943 					ctx->shader->vs_out_edgeflag = 1;
944 					ctx->edgeflag_output = i;
945 					break;
946 				case TGSI_SEMANTIC_VIEWPORT_INDEX:
947 					ctx->shader->vs_out_misc_write = 1;
948 					ctx->shader->vs_out_viewport = 1;
949 					break;
950 				case TGSI_SEMANTIC_LAYER:
951 					ctx->shader->vs_out_misc_write = 1;
952 					ctx->shader->vs_out_layer = 1;
953 					break;
954 				case TGSI_SEMANTIC_CLIPVERTEX:
955 					ctx->clip_vertex_write = TRUE;
956 					ctx->cv_output = i;
957 					break;
958 				}
959 				if (ctx->type == PIPE_SHADER_GEOMETRY) {
960 					ctx->gs_out_ring_offset += 16;
961 				}
962 			} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
963 				switch (d->Semantic.Name) {
964 				case TGSI_SEMANTIC_COLOR:
965 					ctx->shader->nr_ps_max_color_exports++;
966 					break;
967 				}
968 			}
969 		}
970 		ctx->shader->noutput += count;
971 		break;
972 	case TGSI_FILE_TEMPORARY:
973 		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
974 			if (d->Array.ArrayID) {
975 				r600_add_gpr_array(ctx->shader,
976 				               ctx->file_offset[TGSI_FILE_TEMPORARY] +
977 								   d->Range.First,
978 				               d->Range.Last - d->Range.First + 1, 0x0F);
979 			}
980 		}
981 		break;
982 
983 	case TGSI_FILE_CONSTANT:
984 	case TGSI_FILE_SAMPLER:
985 	case TGSI_FILE_SAMPLER_VIEW:
986 	case TGSI_FILE_ADDRESS:
987 	case TGSI_FILE_BUFFER:
988 	case TGSI_FILE_IMAGE:
989 	case TGSI_FILE_MEMORY:
990 		break;
991 
992 	case TGSI_FILE_HW_ATOMIC:
993 		i = ctx->shader->nhwatomic_ranges;
994 		ctx->shader->atomics[i].start = d->Range.First;
995 		ctx->shader->atomics[i].end = d->Range.Last;
996 		ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
997 		ctx->shader->atomics[i].array_id = d->Array.ArrayID;
998 		ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
999 		ctx->shader->nhwatomic_ranges++;
1000 		ctx->shader->nhwatomic += count;
1001 		break;
1002 
1003 	case TGSI_FILE_SYSTEM_VALUE:
1004 		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
1005 			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
1006 			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
1007 			break; /* Already handled from allocate_system_value_inputs */
1008 		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
1009 			break;
1010 		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
1011 			break;
1012 		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
1013 			break;
1014 		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
1015 			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
1016 			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
1017 			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
1018 			unsigned temp_reg = r600_get_temp(ctx);
1019 
1020 			r = get_lds_offset0(ctx, 2, temp_reg, true);
1021 			if (r)
1022 				return r;
1023 
1024 			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1025 					   temp_reg, 0,
1026 					   temp_reg, 0,
1027 					   V_SQ_ALU_SRC_LITERAL, param * 16);
1028 			if (r)
1029 				return r;
1030 
1031 			do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
1032 		}
1033 		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
1034 			/* MOV r1.x, r0.x;
1035 			   MOV r1.y, r0.y;
1036 			*/
1037 			for (i = 0; i < 2; i++) {
1038 				struct r600_bytecode_alu alu;
1039 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1040 				alu.op = ALU_OP1_MOV;
1041 				alu.src[0].sel = 0;
1042 				alu.src[0].chan = 0 + i;
1043 				alu.dst.sel = 1;
1044 				alu.dst.chan = 0 + i;
1045 				alu.dst.write = 1;
1046 				alu.last = (i == 1) ? 1 : 0;
1047 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1048 					return r;
1049 			}
1050 			/* ADD r1.z, 1.0f, -r0.x */
1051 			struct r600_bytecode_alu alu;
1052 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1053 			alu.op = ALU_OP2_ADD;
1054 			alu.src[0].sel = V_SQ_ALU_SRC_1;
1055 			alu.src[1].sel = 1;
1056 			alu.src[1].chan = 0;
1057 			alu.src[1].neg = 1;
1058 			alu.dst.sel = 1;
1059 			alu.dst.chan = 2;
1060 			alu.dst.write = 1;
1061 			alu.last = 1;
1062 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1063 				return r;
1064 
1065 			/* ADD r1.z, r1.z, -r1.y */
1066 			alu.op = ALU_OP2_ADD;
1067 			alu.src[0].sel = 1;
1068 			alu.src[0].chan = 2;
1069 			alu.src[1].sel = 1;
1070 			alu.src[1].chan = 1;
1071 			alu.src[1].neg = 1;
1072 			alu.dst.sel = 1;
1073 			alu.dst.chan = 2;
1074 			alu.dst.write = 1;
1075 			alu.last = 1;
1076 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1077 				return r;
1078 			break;
1079 		}
1080 		break;
1081 	default:
1082 		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
1083 		return -EINVAL;
1084 	}
1085 	return 0;
1086 }
1087 
allocate_system_value_inputs(struct r600_shader_ctx * ctx,int gpr_offset)1088 static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
1089 {
1090 	struct tgsi_parse_context parse;
1091 	struct {
1092 		boolean enabled;
1093 		int *reg;
1094 		unsigned name, alternate_name;
1095 	} inputs[2] = {
1096 		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */
1097 
1098 		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
1099 	};
1100 	int num_regs = 0;
1101 	unsigned k, i;
1102 
1103 	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
1104 		return 0;
1105 	}
1106 
1107 	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
1108 	while (!tgsi_parse_end_of_tokens(&parse)) {
1109 		tgsi_parse_token(&parse);
1110 
1111 		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
1112 			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
1113 			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
1114 				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
1115 				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
1116 			{
1117 				int interpolate, location, k;
1118 
1119 				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
1120 					location = TGSI_INTERPOLATE_LOC_CENTER;
1121 					inputs[1].enabled = true; /* needs SAMPLEID */
1122 				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
1123 					location = TGSI_INTERPOLATE_LOC_CENTER;
1124 					/* Needs sample positions, currently those are always available */
1125 				} else {
1126 					location = TGSI_INTERPOLATE_LOC_CENTROID;
1127 				}
1128 
1129 				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
1130 				k = eg_get_interpolator_index(interpolate, location);
1131 				if (k >= 0)
1132 					ctx->eg_interpolators[k].enabled = true;
1133 			}
1134 		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
1135 			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
1136 			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1137 				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
1138 					if (d->Semantic.Name == inputs[k].name ||
1139 						d->Semantic.Name == inputs[k].alternate_name) {
1140 						inputs[k].enabled = true;
1141 					}
1142 				}
1143 			}
1144 		}
1145 	}
1146 
1147 	tgsi_parse_free(&parse);
1148 
1149 	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
1150 		boolean enabled = inputs[i].enabled;
1151 		int *reg = inputs[i].reg;
1152 		unsigned name = inputs[i].name;
1153 
1154 		if (enabled) {
1155 			int gpr = gpr_offset + num_regs++;
1156 			ctx->shader->nsys_inputs++;
1157 
1158 			// add to inputs, allocate a gpr
1159 			k = ctx->shader->ninput++;
1160 			ctx->shader->input[k].name = name;
1161 			ctx->shader->input[k].sid = 0;
1162 			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
1163 			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
1164 			*reg = ctx->shader->input[k].gpr = gpr;
1165 		}
1166 	}
1167 
1168 	return gpr_offset + num_regs;
1169 }
1170 
1171 /*
1172  * for evergreen we need to scan the shader to find the number of GPRs we need to
1173  * reserve for interpolation and system values
1174  *
1175  * we need to know if we are going to emit
1176  * any sample or centroid inputs
1177  * if perspective and linear are required
1178 */
evergreen_gpr_count(struct r600_shader_ctx * ctx)1179 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
1180 {
1181 	unsigned i;
1182 	int num_baryc;
1183 	struct tgsi_parse_context parse;
1184 
1185 	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
1186 
1187 	for (i = 0; i < ctx->info.num_inputs; i++) {
1188 		int k;
1189 		/* skip position/face/mask/sampleid */
1190 		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
1191 		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
1192 		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
1193 		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
1194 			continue;
1195 
1196 		k = eg_get_interpolator_index(
1197 			ctx->info.input_interpolate[i],
1198 			ctx->info.input_interpolate_loc[i]);
1199 		if (k >= 0)
1200 			ctx->eg_interpolators[k].enabled = TRUE;
1201 	}
1202 
1203 	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
1204 		return 0;
1205 	}
1206 
1207 	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
1208 	while (!tgsi_parse_end_of_tokens(&parse)) {
1209 		tgsi_parse_token(&parse);
1210 
1211 		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
1212 			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
1213 			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
1214 				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
1215 				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
1216 			{
1217 				int interpolate, location, k;
1218 
1219 				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
1220 					location = TGSI_INTERPOLATE_LOC_CENTER;
1221 				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
1222 					location = TGSI_INTERPOLATE_LOC_CENTER;
1223 				} else {
1224 					location = TGSI_INTERPOLATE_LOC_CENTROID;
1225 				}
1226 
1227 				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
1228 				k = eg_get_interpolator_index(interpolate, location);
1229 				if (k >= 0)
1230 					ctx->eg_interpolators[k].enabled = true;
1231 			}
1232 		}
1233 	}
1234 
1235 	tgsi_parse_free(&parse);
1236 
1237 	/* assign gpr to each interpolator according to priority */
1238 	num_baryc = 0;
1239 	for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
1240 		if (ctx->eg_interpolators[i].enabled) {
1241 			ctx->eg_interpolators[i].ij_index = num_baryc;
1242 			num_baryc ++;
1243 		}
1244 	}
1245 
1246 	/* XXX PULL MODEL and LINE STIPPLE */
1247 
1248 	num_baryc = (num_baryc + 1) >> 1;
1249 	return allocate_system_value_inputs(ctx, num_baryc);
1250 }
1251 
/* sample_id == NULL means fetch for the current sample */
load_sample_position(struct r600_shader_ctx * ctx,struct r600_shader_src * sample_id,int chan_sel)1253 static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
1254 {
1255 	struct r600_bytecode_vtx vtx;
1256 	int r, t1;
1257 
1258 	assert(ctx->fixed_pt_position_gpr != -1);
1259 
1260 	t1 = r600_get_temp(ctx);
1261 
1262 	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1263 	vtx.op = FETCH_OP_VFETCH;
1264 	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1265 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1266 	if (sample_id == NULL) {
1267 		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
1268 		vtx.src_sel_x = 3;
1269 	}
1270 	else {
1271 		struct r600_bytecode_alu alu;
1272 
1273 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1274 		alu.op = ALU_OP1_MOV;
1275 		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
1276 		alu.dst.sel = t1;
1277 		alu.dst.write = 1;
1278 		alu.last = 1;
1279 		r = r600_bytecode_add_alu(ctx->bc, &alu);
1280 		if (r)
1281 			return r;
1282 
1283 		vtx.src_gpr = t1;
1284 		vtx.src_sel_x = 0;
1285 	}
1286 	vtx.mega_fetch_count = 16;
1287 	vtx.dst_gpr = t1;
1288 	vtx.dst_sel_x = 0;
1289 	vtx.dst_sel_y = 1;
1290 	vtx.dst_sel_z = 2;
1291 	vtx.dst_sel_w = 3;
1292 	vtx.data_format = FMT_32_32_32_32_FLOAT;
1293 	vtx.num_format_all = 2;
1294 	vtx.format_comp_all = 1;
1295 	vtx.use_const_fields = 0;
1296 	vtx.offset = 0;
1297 	vtx.endian = r600_endian_swap(32);
1298 	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1299 
1300 	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1301 	if (r)
1302 		return r;
1303 
1304 	return t1;
1305 }
1306 
load_block_grid_size(struct r600_shader_ctx * ctx,bool load_block)1307 static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
1308 {
1309 	struct r600_bytecode_vtx vtx;
1310 	int r, t1;
1311 
1312 	if (ctx->cs_block_size_loaded)
1313 		return ctx->cs_block_size_reg;
1314 	if (ctx->cs_grid_size_loaded)
1315 		return ctx->cs_grid_size_reg;
1316 
1317 	t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
1318 	struct r600_bytecode_alu alu;
1319 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1320 	alu.op = ALU_OP1_MOV;
1321 	alu.src[0].sel = V_SQ_ALU_SRC_0;
1322 	alu.dst.sel = t1;
1323 	alu.dst.write = 1;
1324 	alu.last = 1;
1325 	r = r600_bytecode_add_alu(ctx->bc, &alu);
1326 	if (r)
1327 		return r;
1328 
1329 	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1330 	vtx.op = FETCH_OP_VFETCH;
1331 	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1332 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1333 	vtx.src_gpr = t1;
1334 	vtx.src_sel_x = 0;
1335 
1336 	vtx.mega_fetch_count = 16;
1337 	vtx.dst_gpr = t1;
1338 	vtx.dst_sel_x = 0;
1339 	vtx.dst_sel_y = 1;
1340 	vtx.dst_sel_z = 2;
1341 	vtx.dst_sel_w = 7;
1342 	vtx.data_format = FMT_32_32_32_32;
1343 	vtx.num_format_all = 1;
1344 	vtx.format_comp_all = 0;
1345 	vtx.use_const_fields = 0;
1346 	vtx.offset = load_block ? 0 : 16; // first element is size of buffer
1347 	vtx.endian = r600_endian_swap(32);
1348 	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1349 
1350 	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1351 	if (r)
1352 		return r;
1353 
1354 	if (load_block)
1355 		ctx->cs_block_size_loaded = true;
1356 	else
1357 		ctx->cs_grid_size_loaded = true;
1358 	return t1;
1359 }
1360 
tgsi_src(struct r600_shader_ctx * ctx,const struct tgsi_full_src_register * tgsi_src,struct r600_shader_src * r600_src)1361 static void tgsi_src(struct r600_shader_ctx *ctx,
1362 		     const struct tgsi_full_src_register *tgsi_src,
1363 		     struct r600_shader_src *r600_src)
1364 {
1365 	memset(r600_src, 0, sizeof(*r600_src));
1366 	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
1367 	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
1368 	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
1369 	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
1370 	r600_src->neg = tgsi_src->Register.Negate;
1371 	r600_src->abs = tgsi_src->Register.Absolute;
1372 
1373 	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
1374 		int index;
1375 		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
1376 			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
1377 			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
1378 
1379 			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
1380 			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
1381 			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
1382 				return;
1383 		}
1384 		index = tgsi_src->Register.Index;
1385 		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
1386 		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
1387 	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
1388 		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
1389 			r600_src->swizzle[0] = 2; // Z value
1390 			r600_src->swizzle[1] = 2;
1391 			r600_src->swizzle[2] = 2;
1392 			r600_src->swizzle[3] = 2;
1393 			r600_src->sel = ctx->face_gpr;
1394 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
1395 			r600_src->swizzle[0] = 3; // W value
1396 			r600_src->swizzle[1] = 3;
1397 			r600_src->swizzle[2] = 3;
1398 			r600_src->swizzle[3] = 3;
1399 			r600_src->sel = ctx->fixed_pt_position_gpr;
1400 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
1401 			r600_src->swizzle[0] = 0;
1402 			r600_src->swizzle[1] = 1;
1403 			r600_src->swizzle[2] = 4;
1404 			r600_src->swizzle[3] = 4;
1405 			r600_src->sel = load_sample_position(ctx, NULL, -1);
1406 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
1407 			r600_src->swizzle[0] = 3;
1408 			r600_src->swizzle[1] = 3;
1409 			r600_src->swizzle[2] = 3;
1410 			r600_src->swizzle[3] = 3;
1411 			r600_src->sel = 0;
1412 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
1413 			r600_src->swizzle[0] = 0;
1414 			r600_src->swizzle[1] = 0;
1415 			r600_src->swizzle[2] = 0;
1416 			r600_src->swizzle[3] = 0;
1417 			r600_src->sel = 0;
1418 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
1419 			r600_src->sel = 0;
1420 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
1421 			r600_src->sel = 1;
1422 		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1423 			r600_src->swizzle[0] = 3;
1424 			r600_src->swizzle[1] = 3;
1425 			r600_src->swizzle[2] = 3;
1426 			r600_src->swizzle[3] = 3;
1427 			r600_src->sel = 1;
1428 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1429 			r600_src->swizzle[0] = 2;
1430 			r600_src->swizzle[1] = 2;
1431 			r600_src->swizzle[2] = 2;
1432 			r600_src->swizzle[3] = 2;
1433 			r600_src->sel = 0;
1434 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
1435 			r600_src->sel = 1;
1436 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
1437 			r600_src->sel = 3;
1438 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
1439 			r600_src->sel = 2;
1440 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
1441 			if (ctx->type == PIPE_SHADER_TESS_CTRL) {
1442 				r600_src->sel = ctx->tess_input_info;
1443 				r600_src->swizzle[0] = 2;
1444 				r600_src->swizzle[1] = 2;
1445 				r600_src->swizzle[2] = 2;
1446 				r600_src->swizzle[3] = 2;
1447 			} else {
1448 				r600_src->sel = ctx->tess_input_info;
1449 				r600_src->swizzle[0] = 3;
1450 				r600_src->swizzle[1] = 3;
1451 				r600_src->swizzle[2] = 3;
1452 				r600_src->swizzle[3] = 3;
1453 			}
1454 		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
1455 			r600_src->sel = 0;
1456 			r600_src->swizzle[0] = 0;
1457 			r600_src->swizzle[1] = 0;
1458 			r600_src->swizzle[2] = 0;
1459 			r600_src->swizzle[3] = 0;
1460 		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
1461 			r600_src->sel = 0;
1462 			r600_src->swizzle[0] = 3;
1463 			r600_src->swizzle[1] = 3;
1464 			r600_src->swizzle[2] = 3;
1465 			r600_src->swizzle[3] = 3;
1466 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
1467 			r600_src->sel = load_block_grid_size(ctx, false);
1468 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
1469 			r600_src->sel = load_block_grid_size(ctx, true);
1470 		}
1471 	} else {
1472 		if (tgsi_src->Register.Indirect)
1473 			r600_src->rel = V_SQ_REL_RELATIVE;
1474 		r600_src->sel = tgsi_src->Register.Index;
1475 		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1476 	}
1477 	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1478 		if (tgsi_src->Register.Dimension) {
1479 			r600_src->kc_bank = tgsi_src->Dimension.Index;
1480 			if (tgsi_src->Dimension.Indirect) {
1481 				r600_src->kc_rel = 1;
1482 			}
1483 		}
1484 	}
1485 }
1486 
tgsi_fetch_rel_const(struct r600_shader_ctx * ctx,unsigned int cb_idx,unsigned cb_rel,unsigned int offset,unsigned ar_chan,unsigned int dst_reg)1487 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1488                                 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1489                                 unsigned int dst_reg)
1490 {
1491 	struct r600_bytecode_vtx vtx;
1492 	unsigned int ar_reg;
1493 	int r;
1494 
1495 	if (offset) {
1496 		struct r600_bytecode_alu alu;
1497 
1498 		memset(&alu, 0, sizeof(alu));
1499 
1500 		alu.op = ALU_OP2_ADD_INT;
1501 		alu.src[0].sel = ctx->bc->ar_reg;
1502 		alu.src[0].chan = ar_chan;
1503 
1504 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1505 		alu.src[1].value = offset;
1506 
1507 		alu.dst.sel = dst_reg;
1508 		alu.dst.chan = ar_chan;
1509 		alu.dst.write = 1;
1510 		alu.last = 1;
1511 
1512 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1513 			return r;
1514 
1515 		ar_reg = dst_reg;
1516 	} else {
1517 		ar_reg = ctx->bc->ar_reg;
1518 	}
1519 
1520 	memset(&vtx, 0, sizeof(vtx));
1521 	vtx.buffer_id = cb_idx;
1522 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1523 	vtx.src_gpr = ar_reg;
1524 	vtx.src_sel_x = ar_chan;
1525 	vtx.mega_fetch_count = 16;
1526 	vtx.dst_gpr = dst_reg;
1527 	vtx.dst_sel_x = 0;		/* SEL_X */
1528 	vtx.dst_sel_y = 1;		/* SEL_Y */
1529 	vtx.dst_sel_z = 2;		/* SEL_Z */
1530 	vtx.dst_sel_w = 3;		/* SEL_W */
1531 	vtx.data_format = FMT_32_32_32_32_FLOAT;
1532 	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1533 	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1534 	vtx.endian = r600_endian_swap(32);
1535 	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1536 
1537 	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1538 		return r;
1539 
1540 	return 0;
1541 }
1542 
fetch_gs_input(struct r600_shader_ctx * ctx,struct tgsi_full_src_register * src,unsigned int dst_reg)1543 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1544 {
1545 	struct r600_bytecode_vtx vtx;
1546 	int r;
1547 	unsigned index = src->Register.Index;
1548 	unsigned vtx_id = src->Dimension.Index;
1549 	int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
1550 	int offset_chan = vtx_id % 3;
1551 	int t2 = 0;
1552 
1553 	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1554 	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1555 
1556 	if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
1557 		offset_chan = 3;
1558 
1559 	if (src->Dimension.Indirect || src->Register.Indirect)
1560 		t2 = r600_get_temp(ctx);
1561 
1562 	if (src->Dimension.Indirect) {
1563 		int treg[3];
1564 		struct r600_bytecode_alu alu;
1565 		int r, i;
1566 		unsigned addr_reg;
1567 		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
1568 		if (src->DimIndirect.Index > 0) {
1569 			r = single_alu_op2(ctx, ALU_OP1_MOV,
1570 					   ctx->bc->ar_reg, 0,
1571 					   addr_reg, 0,
1572 					   0, 0);
1573 			if (r)
1574 				return r;
1575 		}
1576 		/*
1577 		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
1578 		   at least this is what fglrx seems to do. */
1579 		for (i = 0; i < 3; i++) {
1580 			treg[i] = r600_get_temp(ctx);
1581 		}
1582 		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1583 
1584 		for (i = 0; i < 3; i++) {
1585 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1586 			alu.op = ALU_OP1_MOV;
1587 			alu.src[0].sel = ctx->gs_rotated_input[0];
1588 			alu.src[0].chan = i == 2 ? 3 : i;
1589 			alu.dst.sel = treg[i];
1590 			alu.dst.chan = 0;
1591 			alu.dst.write = 1;
1592 			alu.last = 1;
1593 			r = r600_bytecode_add_alu(ctx->bc, &alu);
1594 			if (r)
1595 				return r;
1596 		}
1597 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1598 		alu.op = ALU_OP1_MOV;
1599 		alu.src[0].sel = treg[0];
1600 		alu.src[0].rel = 1;
1601 		alu.dst.sel = t2;
1602 		alu.dst.write = 1;
1603 		alu.last = 1;
1604 		r = r600_bytecode_add_alu(ctx->bc, &alu);
1605 		if (r)
1606 			return r;
1607 		offset_reg = t2;
1608 		offset_chan = 0;
1609 	}
1610 
1611 	if (src->Register.Indirect) {
1612 		int addr_reg;
1613 		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
1614 
1615 		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
1616 
1617 		/* pull the value from index_reg */
1618 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1619 				   t2, 1,
1620 				   addr_reg, 0,
1621 				   V_SQ_ALU_SRC_LITERAL, first);
1622 		if (r)
1623 			return r;
1624 		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1625 				   t2, 0,
1626 				   t2, 1,
1627 				   V_SQ_ALU_SRC_LITERAL, 4,
1628 				   offset_reg, offset_chan);
1629 		if (r)
1630 			return r;
1631 		offset_reg = t2;
1632 		offset_chan = 0;
1633 		index = src->Register.Index - first;
1634 	}
1635 
1636 	memset(&vtx, 0, sizeof(vtx));
1637 	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1638 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1639 	vtx.src_gpr = offset_reg;
1640 	vtx.src_sel_x = offset_chan;
1641 	vtx.offset = index * 16; /*bytes*/
1642 	vtx.mega_fetch_count = 16;
1643 	vtx.dst_gpr = dst_reg;
1644 	vtx.dst_sel_x = 0;		/* SEL_X */
1645 	vtx.dst_sel_y = 1;		/* SEL_Y */
1646 	vtx.dst_sel_z = 2;		/* SEL_Z */
1647 	vtx.dst_sel_w = 3;		/* SEL_W */
1648 	if (ctx->bc->chip_class >= EVERGREEN) {
1649 		vtx.use_const_fields = 1;
1650 	} else {
1651 		vtx.data_format = FMT_32_32_32_32_FLOAT;
1652 	}
1653 
1654 	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1655 		return r;
1656 
1657 	return 0;
1658 }
1659 
tgsi_split_gs_inputs(struct r600_shader_ctx * ctx)1660 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1661 {
1662 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1663 	unsigned i;
1664 
1665 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1666 		struct tgsi_full_src_register *src = &inst->Src[i];
1667 
1668 		if (src->Register.File == TGSI_FILE_INPUT) {
1669 			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1670 				/* primitive id is in R0.z */
1671 				ctx->src[i].sel = 0;
1672 				ctx->src[i].swizzle[0] = 2;
1673 			}
1674 		}
1675 		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1676 			int treg = r600_get_temp(ctx);
1677 
1678 			fetch_gs_input(ctx, src, treg);
1679 			ctx->src[i].sel = treg;
1680 			ctx->src[i].rel = 0;
1681 		}
1682 	}
1683 	return 0;
1684 }
1685 
1686 
1687 /* Tessellation shaders pass outputs to the next shader using LDS.
1688  *
1689  * LS outputs = TCS(HS) inputs
1690  * TCS(HS) outputs = TES(DS) inputs
1691  *
1692  * The LDS layout is:
1693  * - TCS inputs for patch 0
1694  * - TCS inputs for patch 1
1695  * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
1696  * - ...
1697  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
1698  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
1699  * - TCS outputs for patch 1
1700  * - Per-patch TCS outputs for patch 1
1701  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
1702  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
1703  * - ...
1704  *
1705  * All three shaders VS(LS), TCS, TES share the same LDS space.
1706  */
/* this will return with the byte address in temp_reg.x */
r600_get_byte_address(struct r600_shader_ctx * ctx,int temp_reg,const struct tgsi_full_dst_register * dst,const struct tgsi_full_src_register * src,int stride_bytes_reg,int stride_bytes_chan)1708 static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
1709 				 const struct tgsi_full_dst_register *dst,
1710 				 const struct tgsi_full_src_register *src,
1711 				 int stride_bytes_reg, int stride_bytes_chan)
1712 {
1713 	struct tgsi_full_dst_register reg;
1714 	ubyte *name, *index, *array_first;
1715 	int r;
1716 	int param;
1717 	struct tgsi_shader_info *info = &ctx->info;
1718 	/* Set the register description. The address computation is the same
1719 	 * for sources and destinations. */
1720 	if (src) {
1721 		reg.Register.File = src->Register.File;
1722 		reg.Register.Index = src->Register.Index;
1723 		reg.Register.Indirect = src->Register.Indirect;
1724 		reg.Register.Dimension = src->Register.Dimension;
1725 		reg.Indirect = src->Indirect;
1726 		reg.Dimension = src->Dimension;
1727 		reg.DimIndirect = src->DimIndirect;
1728 	} else
1729 		reg = *dst;
1730 
1731 	/* If the register is 2-dimensional (e.g. an array of vertices
1732 	 * in a primitive), calculate the base address of the vertex. */
1733 	if (reg.Register.Dimension) {
1734 		int sel, chan;
1735 		if (reg.Dimension.Indirect) {
1736 			unsigned addr_reg;
1737 			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);
1738 
1739 			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
1740 			/* pull the value from index_reg */
1741 			sel = addr_reg;
1742 			chan = 0;
1743 		} else {
1744 			sel = V_SQ_ALU_SRC_LITERAL;
1745 			chan = reg.Dimension.Index;
1746 		}
1747 
1748 		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1749 				   temp_reg, 0,
1750 				   stride_bytes_reg, stride_bytes_chan,
1751 				   sel, chan,
1752 				   temp_reg, 0);
1753 		if (r)
1754 			return r;
1755 	}
1756 
1757 	if (reg.Register.File == TGSI_FILE_INPUT) {
1758 		name = info->input_semantic_name;
1759 		index = info->input_semantic_index;
1760 		array_first = info->input_array_first;
1761 	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
1762 		name = info->output_semantic_name;
1763 		index = info->output_semantic_index;
1764 		array_first = info->output_array_first;
1765 	} else {
1766 		assert(0);
1767 		return -1;
1768 	}
1769 	if (reg.Register.Indirect) {
1770 		int addr_reg;
1771 		int first;
1772 		/* Add the relative address of the element. */
1773 		if (reg.Indirect.ArrayID)
1774 			first = array_first[reg.Indirect.ArrayID];
1775 		else
1776 			first = reg.Register.Index;
1777 
1778 		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
1779 
1780 		/* pull the value from index_reg */
1781 		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1782 				   temp_reg, 0,
1783 				   V_SQ_ALU_SRC_LITERAL, 16,
1784 				   addr_reg, 0,
1785 				   temp_reg, 0);
1786 		if (r)
1787 			return r;
1788 
1789 		param = r600_get_lds_unique_index(name[first],
1790 						  index[first]);
1791 
1792 	} else {
1793 		param = r600_get_lds_unique_index(name[reg.Register.Index],
1794 						  index[reg.Register.Index]);
1795 	}
1796 
1797 	/* add to base_addr - passed in temp_reg.x */
1798 	if (param) {
1799 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1800 				   temp_reg, 0,
1801 				   temp_reg, 0,
1802 				   V_SQ_ALU_SRC_LITERAL, param * 16);
1803 		if (r)
1804 			return r;
1805 
1806 	}
1807 	return 0;
1808 }
1809 
do_lds_fetch_values(struct r600_shader_ctx * ctx,unsigned temp_reg,unsigned dst_reg,unsigned mask)1810 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
1811 			       unsigned dst_reg, unsigned mask)
1812 {
1813 	struct r600_bytecode_alu alu;
1814 	int r, i, lasti;
1815 
1816 	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
1817 		ctx->bc->force_add_cf = 1;
1818 
1819 	lasti = tgsi_last_instruction(mask);
1820 	for (i = 1; i <= lasti; i++) {
1821 		if (!(mask & (1 << i)))
1822 			continue;
1823 
1824 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1825 				   temp_reg, i,
1826 				   temp_reg, 0,
1827 				   V_SQ_ALU_SRC_LITERAL, 4 * i);
1828 		if (r)
1829 			return r;
1830 	}
1831 	for (i = 0; i <= lasti; i++) {
1832 		if (!(mask & (1 << i)))
1833 			continue;
1834 
1835 		/* emit an LDS_READ_RET */
1836 		memset(&alu, 0, sizeof(alu));
1837 		alu.op = LDS_OP1_LDS_READ_RET;
1838 		alu.src[0].sel = temp_reg;
1839 		alu.src[0].chan = i;
1840 		alu.src[1].sel = V_SQ_ALU_SRC_0;
1841 		alu.src[2].sel = V_SQ_ALU_SRC_0;
1842 		alu.dst.chan = 0;
1843 		alu.is_lds_idx_op = true;
1844 		alu.last = 1;
1845 		r = r600_bytecode_add_alu(ctx->bc, &alu);
1846 		if (r)
1847 			return r;
1848 	}
1849 	for (i = 0; i <= lasti; i++) {
1850 		if (!(mask & (1 << i)))
1851 			continue;
1852 
1853 		/* then read from LDS_OQ_A_POP */
1854 		memset(&alu, 0, sizeof(alu));
1855 
1856 		alu.op = ALU_OP1_MOV;
1857 		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
1858 		alu.src[0].chan = 0;
1859 		alu.dst.sel = dst_reg;
1860 		alu.dst.chan = i;
1861 		alu.dst.write = 1;
1862 		alu.last = 1;
1863 		r = r600_bytecode_add_alu(ctx->bc, &alu);
1864 		if (r)
1865 			return r;
1866 	}
1867 	return 0;
1868 }
1869 
fetch_mask(struct tgsi_src_register * reg)1870 static int fetch_mask(struct tgsi_src_register *reg)
1871 {
1872 	int mask = 0;
1873 	mask |= 1 << reg->SwizzleX;
1874 	mask |= 1 << reg->SwizzleY;
1875 	mask |= 1 << reg->SwizzleZ;
1876 	mask |= 1 << reg->SwizzleW;
1877 	return mask;
1878 }
1879 
fetch_tes_input(struct r600_shader_ctx * ctx,struct tgsi_full_src_register * src,unsigned int dst_reg)1880 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1881 {
1882 	int r;
1883 	unsigned temp_reg = r600_get_temp(ctx);
1884 
1885 	r = get_lds_offset0(ctx, 2, temp_reg,
1886 			    src->Register.Dimension ? false : true);
1887 	if (r)
1888 		return r;
1889 
1890 	/* the base address is now in temp.x */
1891 	r = r600_get_byte_address(ctx, temp_reg,
1892 				  NULL, src, ctx->tess_output_info, 1);
1893 	if (r)
1894 		return r;
1895 
1896 	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1897 	if (r)
1898 		return r;
1899 	return 0;
1900 }
1901 
fetch_tcs_input(struct r600_shader_ctx * ctx,struct tgsi_full_src_register * src,unsigned int dst_reg)1902 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1903 {
1904 	int r;
1905 	unsigned temp_reg = r600_get_temp(ctx);
1906 
1907 	/* t.x = ips * r0.y */
1908 	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
1909 			   temp_reg, 0,
1910 			   ctx->tess_input_info, 0,
1911 			   0, 1);
1912 
1913 	if (r)
1914 		return r;
1915 
1916 	/* the base address is now in temp.x */
1917 	r = r600_get_byte_address(ctx, temp_reg,
1918 				  NULL, src, ctx->tess_input_info, 1);
1919 	if (r)
1920 		return r;
1921 
1922 	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1923 	if (r)
1924 		return r;
1925 	return 0;
1926 }
1927 
fetch_tcs_output(struct r600_shader_ctx * ctx,struct tgsi_full_src_register * src,unsigned int dst_reg)1928 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1929 {
1930 	int r;
1931 	unsigned temp_reg = r600_get_temp(ctx);
1932 
1933 	r = get_lds_offset0(ctx, 1, temp_reg,
1934 			    src->Register.Dimension ? false : true);
1935 	if (r)
1936 		return r;
1937 	/* the base address is now in temp.x */
1938 	r = r600_get_byte_address(ctx, temp_reg,
1939 				  NULL, src,
1940 				  ctx->tess_output_info, 1);
1941 	if (r)
1942 		return r;
1943 
1944 	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1945 	if (r)
1946 		return r;
1947 	return 0;
1948 }
1949 
tgsi_split_lds_inputs(struct r600_shader_ctx * ctx)1950 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
1951 {
1952 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1953 	unsigned i;
1954 
1955 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1956 		struct tgsi_full_src_register *src = &inst->Src[i];
1957 
1958 		if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
1959 			int treg = r600_get_temp(ctx);
1960 			fetch_tes_input(ctx, src, treg);
1961 			ctx->src[i].sel = treg;
1962 			ctx->src[i].rel = 0;
1963 		}
1964 		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
1965 			int treg = r600_get_temp(ctx);
1966 			fetch_tcs_input(ctx, src, treg);
1967 			ctx->src[i].sel = treg;
1968 			ctx->src[i].rel = 0;
1969 		}
1970 		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
1971 			int treg = r600_get_temp(ctx);
1972 			fetch_tcs_output(ctx, src, treg);
1973 			ctx->src[i].sel = treg;
1974 			ctx->src[i].rel = 0;
1975 		}
1976 	}
1977 	return 0;
1978 }
1979 
tgsi_split_constant(struct r600_shader_ctx * ctx)1980 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1981 {
1982 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1983 	struct r600_bytecode_alu alu;
1984 	int i, j, k, nconst, r;
1985 
1986 	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1987 		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1988 			nconst++;
1989 		}
1990 		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1991 	}
1992 	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1993 		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1994 			continue;
1995 		}
1996 
1997 		if (ctx->src[i].rel) {
1998 			int chan = inst->Src[i].Indirect.Swizzle;
1999 			int treg = r600_get_temp(ctx);
2000 			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
2001 				return r;
2002 
2003 			ctx->src[i].kc_bank = 0;
2004 			ctx->src[i].kc_rel = 0;
2005 			ctx->src[i].sel = treg;
2006 			ctx->src[i].rel = 0;
2007 			j--;
2008 		} else if (j > 0) {
2009 			int treg = r600_get_temp(ctx);
2010 			for (k = 0; k < 4; k++) {
2011 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2012 				alu.op = ALU_OP1_MOV;
2013 				alu.src[0].sel = ctx->src[i].sel;
2014 				alu.src[0].chan = k;
2015 				alu.src[0].rel = ctx->src[i].rel;
2016 				alu.src[0].kc_bank = ctx->src[i].kc_bank;
2017 				alu.src[0].kc_rel = ctx->src[i].kc_rel;
2018 				alu.dst.sel = treg;
2019 				alu.dst.chan = k;
2020 				alu.dst.write = 1;
2021 				if (k == 3)
2022 					alu.last = 1;
2023 				r = r600_bytecode_add_alu(ctx->bc, &alu);
2024 				if (r)
2025 					return r;
2026 			}
2027 			ctx->src[i].sel = treg;
2028 			ctx->src[i].rel =0;
2029 			j--;
2030 		}
2031 	}
2032 	return 0;
2033 }
2034 
2035 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
tgsi_split_literal_constant(struct r600_shader_ctx * ctx)2036 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
2037 {
2038 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2039 	struct r600_bytecode_alu alu;
2040 	int i, j, k, nliteral, r;
2041 
2042 	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
2043 		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2044 			nliteral++;
2045 		}
2046 	}
2047 	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
2048 		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2049 			int treg = r600_get_temp(ctx);
2050 			for (k = 0; k < 4; k++) {
2051 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2052 				alu.op = ALU_OP1_MOV;
2053 				alu.src[0].sel = ctx->src[i].sel;
2054 				alu.src[0].chan = k;
2055 				alu.src[0].value = ctx->src[i].value[k];
2056 				alu.dst.sel = treg;
2057 				alu.dst.chan = k;
2058 				alu.dst.write = 1;
2059 				if (k == 3)
2060 					alu.last = 1;
2061 				r = r600_bytecode_add_alu(ctx->bc, &alu);
2062 				if (r)
2063 					return r;
2064 			}
2065 			ctx->src[i].sel = treg;
2066 			j--;
2067 		}
2068 	}
2069 	return 0;
2070 }
2071 
process_twoside_color_inputs(struct r600_shader_ctx * ctx)2072 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2073 {
2074 	int i, r, count = ctx->shader->ninput;
2075 
2076 	for (i = 0; i < count; i++) {
2077 		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2078 			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2079 			if (r)
2080 				return r;
2081 		}
2082 	}
2083 	return 0;
2084 }
2085 
/*
 * Emit the stream-output (transform feedback) writes described by 'so'.
 *
 * ctx              - shader translation context; bytecode goes to ctx->bc
 *                    and ctx->enabled_stream_buffers_mask is updated.
 * so               - stream output layout to emit.
 * stream           - only outputs of this stream are written; -1 writes
 *                    the outputs of every stream.
 * stream_item_size - unused.
 *
 * Returns 0 on success, -EINVAL when 'so' has too many outputs or refers
 * to a buffer index >= 4, or the non-zero result of a bytecode helper.
 */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
						  int stream, unsigned *stream_item_size UNUSED)
{
	unsigned src_gpr[PIPE_MAX_SHADER_OUTPUTS];
	unsigned first_comp[PIPE_MAX_SHADER_OUTPUTS];
	unsigned n;
	int comp, ret;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		return -EINVAL;
	}
	for (n = 0; n < so->num_outputs; n++) {
		if (so->output[n].output_buffer < 4)
			continue;
		R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
			 so->output[n].output_buffer);
		return -EINVAL;
	}

	/* Work out which GPR / first component each output is written from. */
	for (n = 0; n < so->num_outputs; n++) {
		unsigned tmp;

		src_gpr[n] = ctx->shader->output[so->output[n].register_index].gpr;
		first_comp[n] = so->output[n].start_component;

		/* Only outputs with dst_offset < start_component need lowering.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[n].dst_offset >= so->output[n].start_component)
			continue;

		tmp = r600_get_temp(ctx);
		for (comp = 0; comp < so->output[n].num_components; comp++) {
			struct r600_bytecode_alu mov;

			memset(&mov, 0, sizeof(struct r600_bytecode_alu));
			mov.op = ALU_OP1_MOV;
			mov.src[0].sel = src_gpr[n];
			mov.src[0].chan = so->output[n].start_component + comp;
			mov.dst.sel = tmp;
			mov.dst.chan = comp;
			mov.dst.write = 1;
			if (comp == so->output[n].num_components - 1)
				mov.last = 1;

			ret = r600_bytecode_add_alu(ctx->bc, &mov);
			if (ret)
				return ret;
		}
		first_comp[n] = 0;
		src_gpr[n] = tmp;
	}

	/* Write outputs to buffers. */
	for (n = 0; n < so->num_outputs; n++) {
		struct r600_bytecode_output wr;

		if (stream != -1 && stream != so->output[n].stream)
			continue;

		memset(&wr, 0, sizeof(struct r600_bytecode_output));
		wr.gpr = src_gpr[n];
		wr.elem_size = so->output[n].num_components - 1;
		if (wr.elem_size == 2)
			wr.elem_size = 3; // 3 not supported, write 4 with junk at end
		wr.array_base = so->output[n].dst_offset - first_comp[n];
		wr.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		wr.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		wr.array_size = 0xFFF;
		wr.comp_mask = ((1 << so->output[n].num_components) - 1) << first_comp[n];

		if (ctx->bc->chip_class < EVERGREEN) {
			/* One MEM_STREAM opcode per buffer. */
			switch (so->output[n].output_buffer) {
			case 0:
				wr.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				wr.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				wr.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				wr.op = CF_OP_MEM_STREAM3;
				break;
			}
			ctx->enabled_stream_buffers_mask |= 1 << so->output[n].output_buffer;
		} else {
			/* Opcode is selected by buffer, then offset by
			 * 4 * stream; the mask keeps 4 bits per stream. */
			switch (so->output[n].output_buffer) {
			case 0:
				wr.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				wr.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				wr.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				wr.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
			wr.op += so->output[n].stream * 4;
			assert(wr.op >= CF_OP_MEM_STREAM0_BUF0 && wr.op <= CF_OP_MEM_STREAM3_BUF3);
			ctx->enabled_stream_buffers_mask |= (1 << so->output[n].output_buffer) << so->output[n].stream * 4;
		}

		ret = r600_bytecode_add_output(ctx->bc, &wr);
		if (ret)
			return ret;
	}
	return 0;
}
2207 
convert_edgeflag_to_int(struct r600_shader_ctx * ctx)2208 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2209 {
2210 	struct r600_bytecode_alu alu;
2211 	unsigned reg;
2212 
2213 	if (!ctx->shader->vs_out_edgeflag)
2214 		return;
2215 
2216 	reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2217 
2218 	/* clamp(x, 0, 1) */
2219 	memset(&alu, 0, sizeof(alu));
2220 	alu.op = ALU_OP1_MOV;
2221 	alu.src[0].sel = reg;
2222 	alu.dst.sel = reg;
2223 	alu.dst.write = 1;
2224 	alu.dst.clamp = 1;
2225 	alu.last = 1;
2226 	r600_bytecode_add_alu(ctx->bc, &alu);
2227 
2228 	memset(&alu, 0, sizeof(alu));
2229 	alu.op = ALU_OP1_FLT_TO_INT;
2230 	alu.src[0].sel = reg;
2231 	alu.dst.sel = reg;
2232 	alu.dst.write = 1;
2233 	alu.last = 1;
2234 	r600_bytecode_add_alu(ctx->bc, &alu);
2235 }
2236 
generate_gs_copy_shader(struct r600_context * rctx,struct r600_pipe_shader * gs,struct pipe_stream_output_info * so)2237 static int generate_gs_copy_shader(struct r600_context *rctx,
2238 				   struct r600_pipe_shader *gs,
2239 				   struct pipe_stream_output_info *so)
2240 {
2241 	struct r600_shader_ctx ctx = {};
2242 	struct r600_shader *gs_shader = &gs->shader;
2243 	struct r600_pipe_shader *cshader;
2244 	unsigned ocnt = gs_shader->noutput;
2245 	struct r600_bytecode_alu alu;
2246 	struct r600_bytecode_vtx vtx;
2247 	struct r600_bytecode_output output;
2248 	struct r600_bytecode_cf *cf_jump, *cf_pop,
2249 		*last_exp_pos = NULL, *last_exp_param = NULL;
2250 	int next_clip_pos = 61, next_param = 0;
2251 	unsigned i, j;
2252 	int ring;
2253 	bool only_ring_0 = true;
2254 	cshader = calloc(1, sizeof(struct r600_pipe_shader));
2255 	if (!cshader)
2256 		return 0;
2257 
2258 	memcpy(cshader->shader.output, gs_shader->output, ocnt *
2259 	       sizeof(struct r600_shader_io));
2260 
2261 	cshader->shader.noutput = ocnt;
2262 
2263 	ctx.shader = &cshader->shader;
2264 	ctx.bc = &ctx.shader->bc;
2265 	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;
2266 
2267 	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
2268 			   rctx->screen->has_compressed_msaa_texturing);
2269 
2270 	ctx.bc->isa = rctx->isa;
2271 
2272 	cf_jump = NULL;
2273 	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
2274 
2275 	/* R0.x = R0.x & 0x3fffffff */
2276 	memset(&alu, 0, sizeof(alu));
2277 	alu.op = ALU_OP2_AND_INT;
2278 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2279 	alu.src[1].value = 0x3fffffff;
2280 	alu.dst.write = 1;
2281 	r600_bytecode_add_alu(ctx.bc, &alu);
2282 
2283 	/* R0.y = R0.x >> 30 */
2284 	memset(&alu, 0, sizeof(alu));
2285 	alu.op = ALU_OP2_LSHR_INT;
2286 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2287 	alu.src[1].value = 0x1e;
2288 	alu.dst.chan = 1;
2289 	alu.dst.write = 1;
2290 	alu.last = 1;
2291 	r600_bytecode_add_alu(ctx.bc, &alu);
2292 
2293 	/* fetch vertex data from GSVS ring */
2294 	for (i = 0; i < ocnt; ++i) {
2295 		struct r600_shader_io *out = &ctx.shader->output[i];
2296 
2297 		out->gpr = i + 1;
2298 		out->ring_offset = i * 16;
2299 
2300 		memset(&vtx, 0, sizeof(vtx));
2301 		vtx.op = FETCH_OP_VFETCH;
2302 		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
2303 		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2304 		vtx.mega_fetch_count = 16;
2305 		vtx.offset = out->ring_offset;
2306 		vtx.dst_gpr = out->gpr;
2307 		vtx.src_gpr = 0;
2308 		vtx.dst_sel_x = 0;
2309 		vtx.dst_sel_y = 1;
2310 		vtx.dst_sel_z = 2;
2311 		vtx.dst_sel_w = 3;
2312 		if (rctx->b.chip_class >= EVERGREEN) {
2313 			vtx.use_const_fields = 1;
2314 		} else {
2315 			vtx.data_format = FMT_32_32_32_32_FLOAT;
2316 		}
2317 
2318 		r600_bytecode_add_vtx(ctx.bc, &vtx);
2319 	}
2320 	ctx.temp_reg = i + 1;
2321 	for (ring = 3; ring >= 0; --ring) {
2322 		bool enabled = false;
2323 		for (i = 0; i < so->num_outputs; i++) {
2324 			if (so->output[i].stream == ring) {
2325 				enabled = true;
2326 				if (ring > 0)
2327 					only_ring_0 = false;
2328 				break;
2329 			}
2330 		}
2331 		if (ring != 0 && !enabled) {
2332 			cshader->shader.ring_item_sizes[ring] = 0;
2333 			continue;
2334 		}
2335 
2336 		if (cf_jump) {
2337 			// Patch up jump label
2338 			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2339 			cf_pop = ctx.bc->cf_last;
2340 
2341 			cf_jump->cf_addr = cf_pop->id + 2;
2342 			cf_jump->pop_count = 1;
2343 			cf_pop->cf_addr = cf_pop->id + 2;
2344 			cf_pop->pop_count = 1;
2345 		}
2346 
2347 		/* PRED_SETE_INT __, R0.y, ring */
2348 		memset(&alu, 0, sizeof(alu));
2349 		alu.op = ALU_OP2_PRED_SETE_INT;
2350 		alu.src[0].chan = 1;
2351 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2352 		alu.src[1].value = ring;
2353 		alu.execute_mask = 1;
2354 		alu.update_pred = 1;
2355 		alu.last = 1;
2356 		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
2357 
2358 		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
2359 		cf_jump = ctx.bc->cf_last;
2360 
2361 		if (enabled)
2362 			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
2363 		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
2364 	}
2365 
2366 	/* bc adds nops - copy it */
2367 	if (ctx.bc->chip_class == R600) {
2368 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2369 		alu.op = ALU_OP0_NOP;
2370 		alu.last = 1;
2371 		r600_bytecode_add_alu(ctx.bc, &alu);
2372 
2373 		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2374 	}
2375 
2376 	/* export vertex data */
2377 	/* XXX factor out common code with r600_shader_from_tgsi ? */
2378 	for (i = 0; i < ocnt; ++i) {
2379 		struct r600_shader_io *out = &ctx.shader->output[i];
2380 		bool instream0 = true;
2381 		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
2382 			continue;
2383 
2384 		for (j = 0; j < so->num_outputs; j++) {
2385 			if (so->output[j].register_index == i) {
2386 				if (so->output[j].stream == 0)
2387 					break;
2388 				if (so->output[j].stream > 0)
2389 					instream0 = false;
2390 			}
2391 		}
2392 		if (!instream0)
2393 			continue;
2394 		memset(&output, 0, sizeof(output));
2395 		output.gpr = out->gpr;
2396 		output.elem_size = 3;
2397 		output.swizzle_x = 0;
2398 		output.swizzle_y = 1;
2399 		output.swizzle_z = 2;
2400 		output.swizzle_w = 3;
2401 		output.burst_count = 1;
2402 		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2403 		output.op = CF_OP_EXPORT;
2404 		switch (out->name) {
2405 		case TGSI_SEMANTIC_POSITION:
2406 			output.array_base = 60;
2407 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2408 			break;
2409 
2410 		case TGSI_SEMANTIC_PSIZE:
2411 			output.array_base = 61;
2412 			if (next_clip_pos == 61)
2413 				next_clip_pos = 62;
2414 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2415 			output.swizzle_y = 7;
2416 			output.swizzle_z = 7;
2417 			output.swizzle_w = 7;
2418 			ctx.shader->vs_out_misc_write = 1;
2419 			ctx.shader->vs_out_point_size = 1;
2420 			break;
2421 		case TGSI_SEMANTIC_LAYER:
2422 			if (out->spi_sid) {
2423 				/* duplicate it as PARAM to pass to the pixel shader */
2424 				output.array_base = next_param++;
2425 				r600_bytecode_add_output(ctx.bc, &output);
2426 				last_exp_param = ctx.bc->cf_last;
2427 			}
2428 			output.array_base = 61;
2429 			if (next_clip_pos == 61)
2430 				next_clip_pos = 62;
2431 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2432 			output.swizzle_x = 7;
2433 			output.swizzle_y = 7;
2434 			output.swizzle_z = 0;
2435 			output.swizzle_w = 7;
2436 			ctx.shader->vs_out_misc_write = 1;
2437 			ctx.shader->vs_out_layer = 1;
2438 			break;
2439 		case TGSI_SEMANTIC_VIEWPORT_INDEX:
2440 			if (out->spi_sid) {
2441 				/* duplicate it as PARAM to pass to the pixel shader */
2442 				output.array_base = next_param++;
2443 				r600_bytecode_add_output(ctx.bc, &output);
2444 				last_exp_param = ctx.bc->cf_last;
2445 			}
2446 			output.array_base = 61;
2447 			if (next_clip_pos == 61)
2448 				next_clip_pos = 62;
2449 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2450 			ctx.shader->vs_out_misc_write = 1;
2451 			ctx.shader->vs_out_viewport = 1;
2452 			output.swizzle_x = 7;
2453 			output.swizzle_y = 7;
2454 			output.swizzle_z = 7;
2455 			output.swizzle_w = 0;
2456 			break;
2457 		case TGSI_SEMANTIC_CLIPDIST:
2458 			/* spi_sid is 0 for clipdistance outputs that were generated
2459 			 * for clipvertex - we don't need to pass them to PS */
2460 			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
2461 			ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
2462 			ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
2463 			if (out->spi_sid) {
2464 				/* duplicate it as PARAM to pass to the pixel shader */
2465 				output.array_base = next_param++;
2466 				r600_bytecode_add_output(ctx.bc, &output);
2467 				last_exp_param = ctx.bc->cf_last;
2468 			}
2469 			output.array_base = next_clip_pos++;
2470 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2471 			break;
2472 		case TGSI_SEMANTIC_FOG:
2473 			output.swizzle_y = 4; /* 0 */
2474 			output.swizzle_z = 4; /* 0 */
2475 			output.swizzle_w = 5; /* 1 */
2476 			break;
2477 		default:
2478 			output.array_base = next_param++;
2479 			break;
2480 		}
2481 		r600_bytecode_add_output(ctx.bc, &output);
2482 		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
2483 			last_exp_param = ctx.bc->cf_last;
2484 		else
2485 			last_exp_pos = ctx.bc->cf_last;
2486 	}
2487 
2488 	if (!last_exp_pos) {
2489 		memset(&output, 0, sizeof(output));
2490 		output.gpr = 0;
2491 		output.elem_size = 3;
2492 		output.swizzle_x = 7;
2493 		output.swizzle_y = 7;
2494 		output.swizzle_z = 7;
2495 		output.swizzle_w = 7;
2496 		output.burst_count = 1;
2497 		output.type = 2;
2498 		output.op = CF_OP_EXPORT;
2499 		output.array_base = 60;
2500 		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2501 		r600_bytecode_add_output(ctx.bc, &output);
2502 		last_exp_pos = ctx.bc->cf_last;
2503 	}
2504 
2505 	if (!last_exp_param) {
2506 		memset(&output, 0, sizeof(output));
2507 		output.gpr = 0;
2508 		output.elem_size = 3;
2509 		output.swizzle_x = 7;
2510 		output.swizzle_y = 7;
2511 		output.swizzle_z = 7;
2512 		output.swizzle_w = 7;
2513 		output.burst_count = 1;
2514 		output.type = 2;
2515 		output.op = CF_OP_EXPORT;
2516 		output.array_base = next_param++;
2517 		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2518 		r600_bytecode_add_output(ctx.bc, &output);
2519 		last_exp_param = ctx.bc->cf_last;
2520 	}
2521 
2522 	last_exp_pos->op = CF_OP_EXPORT_DONE;
2523 	last_exp_param->op = CF_OP_EXPORT_DONE;
2524 
2525 	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2526 	cf_pop = ctx.bc->cf_last;
2527 
2528 	cf_jump->cf_addr = cf_pop->id + 2;
2529 	cf_jump->pop_count = 1;
2530 	cf_pop->cf_addr = cf_pop->id + 2;
2531 	cf_pop->pop_count = 1;
2532 
2533 	if (ctx.bc->chip_class == CAYMAN)
2534 		cm_bytecode_add_cf_end(ctx.bc);
2535 	else {
2536 		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2537 		ctx.bc->cf_last->end_of_program = 1;
2538 	}
2539 
2540 	gs->gs_copy_shader = cshader;
2541 	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2542 
2543 	ctx.bc->nstack = 1;
2544 
2545 	return r600_bytecode_build(ctx.bc);
2546 }
2547 
emit_inc_ring_offset(struct r600_shader_ctx * ctx,int idx,bool ind)2548 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2549 {
2550 	if (ind) {
2551 		struct r600_bytecode_alu alu;
2552 		int r;
2553 
2554 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2555 		alu.op = ALU_OP2_ADD_INT;
2556 		alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2557 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2558 		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2559 		alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2560 		alu.dst.write = 1;
2561 		alu.last = 1;
2562 		r = r600_bytecode_add_alu(ctx->bc, &alu);
2563 		if (r)
2564 			return r;
2565 	}
2566 	return 0;
2567 }
2568 
/*
 * Emit one ring write per shader output for the current vertex and
 * advance ctx->gs_next_vertex.
 *
 * ctx    - shader translation context.
 * so     - unused.
 * stream - selects the ring opcode (1..3 -> MEM_RING1..3, anything else
 *          -> MEM_RING); for stream > 0 the POSITION output is skipped.
 * ind    - use indexed writes (WRITE_IND) addressed through
 *          ctx->gs_export_gpr_tregs[] instead of folding the vertex
 *          index into the ring offset.
 *
 * Always returns 0; the result of r600_bytecode_add_output() is not
 * checked.
 */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
{
	const int index_stream = (stream == -1) ? 0 : stream;
	struct r600_bytecode_output ring_write;
	unsigned out_idx, in_idx;
	int next_slot = 0;
	int offset;

	for (out_idx = 0; out_idx < ctx->shader->noutput; out_idx++) {
		struct r600_shader_io *out = &ctx->shader->output[out_idx];

		if (!ctx->gs_for_vs) {
			/* outputs are packed back to back, 16 bytes each */
			offset = next_slot * 16;
			next_slot++;
		} else {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			offset = -1;
			for (in_idx = 0; in_idx < ctx->gs_for_vs->ninput; ++in_idx) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[in_idx];

				if (in->name == out->name && in->sid == out->sid)
					offset = in->ring_offset;
			}

			/* the GS does not read this output */
			if (offset == -1)
				continue;
		}

		if (stream > 0 && out->name == TGSI_SEMANTIC_POSITION)
			continue;

		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&ring_write, 0, sizeof(struct r600_bytecode_output));
		ring_write.gpr = out->gpr;
		ring_write.elem_size = 3;
		ring_write.comp_mask = 0xF;
		ring_write.burst_count = 1;
		ring_write.type = ind ? V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND
				      : V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		switch (stream) {
		case 1:
			ring_write.op = CF_OP_MEM_RING1;
			break;
		case 2:
			ring_write.op = CF_OP_MEM_RING2;
			break;
		case 3:
			ring_write.op = CF_OP_MEM_RING3;
			break;
		default:
			ring_write.op = CF_OP_MEM_RING;
			break;
		}

		ring_write.array_base = offset >> 2; /* in dwords */
		if (ind) {
			ring_write.array_size = 0xfff;
			ring_write.index_gpr = ctx->gs_export_gpr_tregs[index_stream];
		}
		r600_bytecode_add_output(ctx->bc, &ring_write);
	}

	++ctx->gs_next_vertex;
	return 0;
}
2639 
2640 
r600_fetch_tess_io_info(struct r600_shader_ctx * ctx)2641 static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2642 {
2643 	int r;
2644 	struct r600_bytecode_vtx vtx;
2645 	int temp_val = ctx->temp_reg;
2646 	/* need to store the TCS output somewhere */
2647 	r = single_alu_op2(ctx, ALU_OP1_MOV,
2648 			   temp_val, 0,
2649 			   V_SQ_ALU_SRC_LITERAL, 0,
2650 			   0, 0);
2651 	if (r)
2652 		return r;
2653 
2654 	/* used by VS/TCS */
2655 	if (ctx->tess_input_info) {
2656 		/* fetch tcs input values into resv space */
2657 		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2658 		vtx.op = FETCH_OP_VFETCH;
2659 		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2660 		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2661 		vtx.mega_fetch_count = 16;
2662 		vtx.data_format = FMT_32_32_32_32;
2663 		vtx.num_format_all = 2;
2664 		vtx.format_comp_all = 1;
2665 		vtx.use_const_fields = 0;
2666 		vtx.endian = r600_endian_swap(32);
2667 		vtx.srf_mode_all = 1;
2668 		vtx.offset = 0;
2669 		vtx.dst_gpr = ctx->tess_input_info;
2670 		vtx.dst_sel_x = 0;
2671 		vtx.dst_sel_y = 1;
2672 		vtx.dst_sel_z = 2;
2673 		vtx.dst_sel_w = 3;
2674 		vtx.src_gpr = temp_val;
2675 		vtx.src_sel_x = 0;
2676 
2677 		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2678 		if (r)
2679 			return r;
2680 	}
2681 
2682 	/* used by TCS/TES */
2683 	if (ctx->tess_output_info) {
2684 		/* fetch tcs output values into resv space */
2685 		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2686 		vtx.op = FETCH_OP_VFETCH;
2687 		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2688 		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2689 		vtx.mega_fetch_count = 16;
2690 		vtx.data_format = FMT_32_32_32_32;
2691 		vtx.num_format_all = 2;
2692 		vtx.format_comp_all = 1;
2693 		vtx.use_const_fields = 0;
2694 		vtx.endian = r600_endian_swap(32);
2695 		vtx.srf_mode_all = 1;
2696 		vtx.offset = 16;
2697 		vtx.dst_gpr = ctx->tess_output_info;
2698 		vtx.dst_sel_x = 0;
2699 		vtx.dst_sel_y = 1;
2700 		vtx.dst_sel_z = 2;
2701 		vtx.dst_sel_w = 3;
2702 		vtx.src_gpr = temp_val;
2703 		vtx.src_sel_x = 0;
2704 
2705 		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2706 		if (r)
2707 			return r;
2708 	}
2709 	return 0;
2710 }
2711 
emit_lds_vs_writes(struct r600_shader_ctx * ctx)2712 static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
2713 {
2714 	int j, r;
2715 	int temp_reg;
2716 	unsigned i;
2717 
2718 	/* fetch tcs input values into input_vals */
2719 	ctx->tess_input_info = r600_get_temp(ctx);
2720 	ctx->tess_output_info = 0;
2721 	r = r600_fetch_tess_io_info(ctx);
2722 	if (r)
2723 		return r;
2724 
2725 	temp_reg = r600_get_temp(ctx);
2726 	/* dst reg contains LDS address stride * idx */
2727 	/* MUL vertexID, vertex_dw_stride */
2728 	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2729 			   temp_reg, 0,
2730 			   ctx->tess_input_info, 1,
2731 			   0, 1); /* rel id in r0.y? */
2732 	if (r)
2733 		return r;
2734 
2735 	for (i = 0; i < ctx->shader->noutput; i++) {
2736 		struct r600_bytecode_alu alu;
2737 		int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);
2738 
2739 		if (param) {
2740 			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2741 					   temp_reg, 1,
2742 					   temp_reg, 0,
2743 					   V_SQ_ALU_SRC_LITERAL, param * 16);
2744 			if (r)
2745 				return r;
2746 		}
2747 
2748 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2749 				   temp_reg, 2,
2750 				   temp_reg, param ? 1 : 0,
2751 				   V_SQ_ALU_SRC_LITERAL, 8);
2752 		if (r)
2753 			return r;
2754 
2755 
2756 		for (j = 0; j < 2; j++) {
2757 			int chan = (j == 1) ? 2 : (param ? 1 : 0);
2758 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2759 			alu.op = LDS_OP3_LDS_WRITE_REL;
2760 			alu.src[0].sel = temp_reg;
2761 			alu.src[0].chan = chan;
2762 			alu.src[1].sel = ctx->shader->output[i].gpr;
2763 			alu.src[1].chan = j * 2;
2764 			alu.src[2].sel = ctx->shader->output[i].gpr;
2765 			alu.src[2].chan = (j * 2) + 1;
2766 			alu.last = 1;
2767 			alu.dst.chan = 0;
2768 			alu.lds_idx = 1;
2769 			alu.is_lds_idx_op = true;
2770 			r = r600_bytecode_add_alu(ctx->bc, &alu);
2771 			if (r)
2772 				return r;
2773 		}
2774 	}
2775 	return 0;
2776 }
2777 
r600_store_tcs_output(struct r600_shader_ctx * ctx)2778 static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
2779 {
2780 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2781 	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
2782 	int i, r, lasti;
2783 	int temp_reg = r600_get_temp(ctx);
2784 	struct r600_bytecode_alu alu;
2785 	unsigned write_mask = dst->Register.WriteMask;
2786 
2787 	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
2788 		return 0;
2789 
2790 	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
2791 	if (r)
2792 		return r;
2793 
2794 	/* the base address is now in temp.x */
2795 	r = r600_get_byte_address(ctx, temp_reg,
2796 				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
2797 	if (r)
2798 		return r;
2799 
2800 	/* LDS write */
2801 	lasti = tgsi_last_instruction(write_mask);
2802 	for (i = 1; i <= lasti; i++) {
2803 
2804 		if (!(write_mask & (1 << i)))
2805 			continue;
2806 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2807 				   temp_reg, i,
2808 				   temp_reg, 0,
2809 				   V_SQ_ALU_SRC_LITERAL, 4 * i);
2810 		if (r)
2811 			return r;
2812 	}
2813 
2814 	for (i = 0; i <= lasti; i++) {
2815 		if (!(write_mask & (1 << i)))
2816 			continue;
2817 
2818 		if ((i == 0 && ((write_mask & 3) == 3)) ||
2819 		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
2820 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2821 			alu.op = LDS_OP3_LDS_WRITE_REL;
2822 			alu.src[0].sel = temp_reg;
2823 			alu.src[0].chan = i;
2824 
2825 			alu.src[1].sel = dst->Register.Index;
2826 			alu.src[1].sel += ctx->file_offset[dst->Register.File];
2827 			alu.src[1].chan = i;
2828 
2829 			alu.src[2].sel = dst->Register.Index;
2830 			alu.src[2].sel += ctx->file_offset[dst->Register.File];
2831 			alu.src[2].chan = i + 1;
2832 			alu.lds_idx = 1;
2833 			alu.dst.chan = 0;
2834 			alu.last = 1;
2835 			alu.is_lds_idx_op = true;
2836 			r = r600_bytecode_add_alu(ctx->bc, &alu);
2837 			if (r)
2838 				return r;
2839 			i += 1;
2840 			continue;
2841 		}
2842 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2843 		alu.op = LDS_OP2_LDS_WRITE;
2844 		alu.src[0].sel = temp_reg;
2845 		alu.src[0].chan = i;
2846 
2847 		alu.src[1].sel = dst->Register.Index;
2848 		alu.src[1].sel += ctx->file_offset[dst->Register.File];
2849 		alu.src[1].chan = i;
2850 
2851 		alu.src[2].sel = V_SQ_ALU_SRC_0;
2852 		alu.dst.chan = 0;
2853 		alu.last = 1;
2854 		alu.is_lds_idx_op = true;
2855 		r = r600_bytecode_add_alu(ctx->bc, &alu);
2856 		if (r)
2857 			return r;
2858 	}
2859 	return 0;
2860 }
2861 
r600_tess_factor_read(struct r600_shader_ctx * ctx,int output_idx,int nc)2862 static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
2863 				 int output_idx, int nc)
2864 {
2865 	int param;
2866 	unsigned temp_reg = r600_get_temp(ctx);
2867 	unsigned name = ctx->shader->output[output_idx].name;
2868 	int dreg = ctx->shader->output[output_idx].gpr;
2869 	int r;
2870 
2871 	param = r600_get_lds_unique_index(name, 0);
2872 	r = get_lds_offset0(ctx, 1, temp_reg, true);
2873 	if (r)
2874 		return r;
2875 
2876 	if (param) {
2877 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2878 				   temp_reg, 0,
2879 				   temp_reg, 0,
2880 				   V_SQ_ALU_SRC_LITERAL, param * 16);
2881 		if (r)
2882 			return r;
2883 	}
2884 
2885 	do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
2886 	return 0;
2887 }
2888 
r600_emit_tess_factor(struct r600_shader_ctx * ctx)2889 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
2890 {
2891 	int stride, outer_comps, inner_comps;
2892 	int tessinner_idx = -1, tessouter_idx = -1;
2893 	int i, r;
2894 	unsigned j;
2895 	int temp_reg = r600_get_temp(ctx);
2896 	int treg[3] = {-1, -1, -1};
2897 	struct r600_bytecode_alu alu;
2898 	struct r600_bytecode_cf *cf_jump, *cf_pop;
2899 
2900 	/* only execute factor emission for invocation 0 */
2901 	/* PRED_SETE_INT __, R0.x, 0 */
2902 	memset(&alu, 0, sizeof(alu));
2903 	alu.op = ALU_OP2_PRED_SETE_INT;
2904 	alu.src[0].chan = 2;
2905 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2906 	alu.execute_mask = 1;
2907 	alu.update_pred = 1;
2908 	alu.last = 1;
2909 	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);
2910 
2911 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
2912 	cf_jump = ctx->bc->cf_last;
2913 
2914 	treg[0] = r600_get_temp(ctx);
2915 	switch (ctx->shader->tcs_prim_mode) {
2916 	case PIPE_PRIM_LINES:
2917 		stride = 8; /* 2 dwords, 1 vec2 store */
2918 		outer_comps = 2;
2919 		inner_comps = 0;
2920 		break;
2921 	case PIPE_PRIM_TRIANGLES:
2922 		stride = 16; /* 4 dwords, 1 vec4 store */
2923 		outer_comps = 3;
2924 		inner_comps = 1;
2925 		treg[1] = r600_get_temp(ctx);
2926 		break;
2927 	case PIPE_PRIM_QUADS:
2928 		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
2929 		outer_comps = 4;
2930 		inner_comps = 2;
2931 		treg[1] = r600_get_temp(ctx);
2932 		treg[2] = r600_get_temp(ctx);
2933 		break;
2934 	default:
2935 		assert(0);
2936 		return -1;
2937 	}
2938 
2939 	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
2940 	/* TF_WRITE takes index in R.x, value in R.y */
2941 	for (j = 0; j < ctx->shader->noutput; j++) {
2942 		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
2943 			tessinner_idx = j;
2944 		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
2945 			tessouter_idx = j;
2946 	}
2947 
2948 	if (tessouter_idx == -1)
2949 		return -1;
2950 
2951 	if (tessinner_idx == -1 && inner_comps)
2952 		return -1;
2953 
2954 	if (tessouter_idx != -1) {
2955 		r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
2956 		if (r)
2957 			return r;
2958 	}
2959 
2960 	if (tessinner_idx != -1) {
2961 		r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
2962 		if (r)
2963 			return r;
2964 	}
2965 
2966 	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
2967 	/* r.x = relpatchid(r0.y) * tf_stride */
2968 
2969 	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
2970 	/* add incoming r0.w to it: t.x = t.x + r0.w */
2971 	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
2972 			   temp_reg, 0,
2973 			   0, 1,
2974 			   V_SQ_ALU_SRC_LITERAL, stride,
2975 			   0, 3);
2976 	if (r)
2977 		return r;
2978 
2979 	for (i = 0; i < outer_comps + inner_comps; i++) {
2980 		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
2981 		int out_comp = i >= outer_comps ? i - outer_comps : i;
2982 
2983 		if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
2984 			if (out_comp == 1)
2985 				out_comp = 0;
2986 			else if (out_comp == 0)
2987 				out_comp = 1;
2988 		}
2989 
2990 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2991 				   treg[i / 2], (2 * (i % 2)),
2992 				   temp_reg, 0,
2993 				   V_SQ_ALU_SRC_LITERAL, 4 * i);
2994 		if (r)
2995 			return r;
2996 		r = single_alu_op2(ctx, ALU_OP1_MOV,
2997 				   treg[i / 2], 1 + (2 * (i%2)),
2998 				   ctx->shader->output[out_idx].gpr, out_comp,
2999 				   0, 0);
3000 		if (r)
3001 			return r;
3002 	}
3003 	for (i = 0; i < outer_comps + inner_comps; i++) {
3004 		struct r600_bytecode_gds gds;
3005 
3006 		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
3007 		gds.src_gpr = treg[i / 2];
3008 		gds.src_sel_x = 2 * (i % 2);
3009 		gds.src_sel_y = 1 + (2 * (i % 2));
3010 		gds.src_sel_z = 4;
3011 		gds.dst_sel_x = 7;
3012 		gds.dst_sel_y = 7;
3013 		gds.dst_sel_z = 7;
3014 		gds.dst_sel_w = 7;
3015 		gds.op = FETCH_OP_TF_WRITE;
3016 		r = r600_bytecode_add_gds(ctx->bc, &gds);
3017 		if (r)
3018 			return r;
3019 	}
3020 
3021 	// Patch up jump label
3022 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
3023 	cf_pop = ctx->bc->cf_last;
3024 
3025 	cf_jump->cf_addr = cf_pop->id + 2;
3026 	cf_jump->pop_count = 1;
3027 	cf_pop->cf_addr = cf_pop->id + 2;
3028 	cf_pop->pop_count = 1;
3029 
3030 	return 0;
3031 }
3032 
/*
 * We have to work out the thread ID for load and atomic
 * operations, which store the returned value to an index
 * in an intermediate buffer.
 * The index is built from the thread id within the wave,
 * which is calculated with the MBCNT instructions.
 * The shader engine ID is multiplied by 256 and the
 * wave id is added.
 * That result is then multiplied by 64 and the thread id
 * is added.
 */
load_thread_id_gpr(struct r600_shader_ctx * ctx)3044 static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
3045 {
3046 	struct r600_bytecode_alu alu;
3047 	int r;
3048 
3049 	if (ctx->thread_id_gpr_loaded)
3050 		return 0;
3051 
3052 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3053 	alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
3054 	alu.dst.sel = ctx->temp_reg;
3055 	alu.dst.chan = 0;
3056 	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3057 	alu.src[0].value = 0xffffffff;
3058 	alu.dst.write = 1;
3059 	r = r600_bytecode_add_alu(ctx->bc, &alu);
3060 	if (r)
3061 		return r;
3062 
3063 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3064 	alu.op = ALU_OP1_MBCNT_32HI_INT;
3065 	alu.dst.sel = ctx->temp_reg;
3066 	alu.dst.chan = 1;
3067 	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3068 	alu.src[0].value = 0xffffffff;
3069 	alu.dst.write = 1;
3070 	r = r600_bytecode_add_alu(ctx->bc, &alu);
3071 	if (r)
3072 		return r;
3073 
3074 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3075 	alu.op = ALU_OP3_MULADD_UINT24;
3076 	alu.dst.sel = ctx->temp_reg;
3077 	alu.dst.chan = 2;
3078 	alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
3079 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3080 	alu.src[1].value = 256;
3081 	alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
3082 	alu.dst.write = 1;
3083 	alu.is_op3 = 1;
3084 	alu.last = 1;
3085 	r = r600_bytecode_add_alu(ctx->bc, &alu);
3086 	if (r)
3087 		return r;
3088 
3089 	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3090 			   ctx->thread_id_gpr, 1,
3091 			   ctx->temp_reg, 2,
3092 			   V_SQ_ALU_SRC_LITERAL, 0x40,
3093 			   ctx->temp_reg, 0);
3094 	if (r)
3095 		return r;
3096 	ctx->thread_id_gpr_loaded = true;
3097 	return 0;
3098 }
3099 
r600_shader_from_tgsi(struct r600_context * rctx,struct r600_pipe_shader * pipeshader,union r600_shader_key key)3100 static int r600_shader_from_tgsi(struct r600_context *rctx,
3101 				 struct r600_pipe_shader *pipeshader,
3102 				 union r600_shader_key key)
3103 {
3104 	struct r600_screen *rscreen = rctx->screen;
3105 	struct r600_shader *shader = &pipeshader->shader;
3106 	struct tgsi_token *tokens = pipeshader->selector->tokens;
3107 	struct pipe_stream_output_info so = pipeshader->selector->so;
3108 	struct tgsi_full_immediate *immediate;
3109 	struct r600_shader_ctx ctx;
3110 	struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3111 	unsigned output_done, noutput;
3112 	unsigned opcode;
3113 	int j, k, r = 0;
3114 	unsigned i;
3115 	int next_param_base = 0, next_clip_base;
3116 	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3117 	bool indirect_gprs;
3118 	bool ring_outputs = false;
3119 	bool lds_outputs = false;
3120 	bool lds_inputs = false;
3121 	bool pos_emitted = false;
3122 
3123 	ctx.bc = &shader->bc;
3124 	ctx.shader = shader;
3125 
3126 	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
3127 			   rscreen->has_compressed_msaa_texturing);
3128 	ctx.tokens = tokens;
3129 	tgsi_scan_shader(tokens, &ctx.info);
3130 	shader->indirect_files = ctx.info.indirect_files;
3131 
3132 	shader->uses_doubles = ctx.info.uses_doubles;
3133 	shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3134 	shader->nsys_inputs = 0;
3135 
3136 	shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
3137 		ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
3138 	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3139 	tgsi_parse_init(&ctx.parse, tokens);
3140 	ctx.type = ctx.info.processor;
3141 	shader->processor_type = ctx.type;
3142 	ctx.bc->type = shader->processor_type;
3143 
3144 	switch (ctx.type) {
3145 	case PIPE_SHADER_VERTEX:
3146 		shader->vs_as_gs_a = key.vs.as_gs_a;
3147 		shader->vs_as_es = key.vs.as_es;
3148 		shader->vs_as_ls = key.vs.as_ls;
3149 		shader->atomic_base = key.vs.first_atomic_counter;
3150 		if (shader->vs_as_es)
3151 			ring_outputs = true;
3152 		if (shader->vs_as_ls)
3153 			lds_outputs = true;
3154 		break;
3155 	case PIPE_SHADER_GEOMETRY:
3156 		ring_outputs = true;
3157 		shader->atomic_base = key.gs.first_atomic_counter;
3158 		shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3159 		break;
3160 	case PIPE_SHADER_TESS_CTRL:
3161 		shader->tcs_prim_mode = key.tcs.prim_mode;
3162 		shader->atomic_base = key.tcs.first_atomic_counter;
3163 		lds_outputs = true;
3164 		lds_inputs = true;
3165 		break;
3166 	case PIPE_SHADER_TESS_EVAL:
3167 		shader->tes_as_es = key.tes.as_es;
3168 		shader->atomic_base = key.tes.first_atomic_counter;
3169 		lds_inputs = true;
3170 		if (shader->tes_as_es)
3171 			ring_outputs = true;
3172 		break;
3173 	case PIPE_SHADER_FRAGMENT:
3174 		shader->two_side = key.ps.color_two_side;
3175 		shader->atomic_base = key.ps.first_atomic_counter;
3176 		shader->rat_base = key.ps.nr_cbufs;
3177 		shader->image_size_const_offset = key.ps.image_size_const_offset;
3178 		break;
3179 	case PIPE_SHADER_COMPUTE:
3180 		shader->rat_base = 0;
3181 		shader->image_size_const_offset = 0;
3182 		break;
3183 	default:
3184 		break;
3185 	}
3186 
3187 	if (shader->vs_as_es || shader->tes_as_es) {
3188 		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3189 	} else {
3190 		ctx.gs_for_vs = NULL;
3191 	}
3192 
3193 	ctx.next_ring_offset = 0;
3194 	ctx.gs_out_ring_offset = 0;
3195 	ctx.gs_next_vertex = 0;
3196 	ctx.gs_stream_output_info = &so;
3197 
3198 	ctx.face_gpr = -1;
3199 	ctx.fixed_pt_position_gpr = -1;
3200 	ctx.fragcoord_input = -1;
3201 	ctx.colors_used = 0;
3202 	ctx.clip_vertex_write = 0;
3203 	ctx.thread_id_gpr_loaded = false;
3204 
3205 	ctx.cs_block_size_reg = -1;
3206 	ctx.cs_grid_size_reg = -1;
3207 	ctx.cs_block_size_loaded = false;
3208 	ctx.cs_grid_size_loaded = false;
3209 
3210 	shader->nr_ps_color_exports = 0;
3211 	shader->nr_ps_max_color_exports = 0;
3212 
3213 
3214 	/* register allocations */
3215 	/* Values [0,127] correspond to GPR[0..127].
3216 	 * Values [128,159] correspond to constant buffer bank 0
3217 	 * Values [160,191] correspond to constant buffer bank 1
3218 	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3219 	 * Values [256,287] correspond to constant buffer bank 2 (EG)
3220 	 * Values [288,319] correspond to constant buffer bank 3 (EG)
3221 	 * Other special values are shown in the list below.
3222 	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3223 	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3224 	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3225 	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3226 	 * 248	SQ_ALU_SRC_0: special constant 0.0.
3227 	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
3228 	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
3229 	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3230 	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
3231 	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
3232 	 * 254	SQ_ALU_SRC_PV: previous vector result.
3233 	 * 255	SQ_ALU_SRC_PS: previous scalar result.
3234 	 */
3235 	for (i = 0; i < TGSI_FILE_COUNT; i++) {
3236 		ctx.file_offset[i] = 0;
3237 	}
3238 
3239 	if (ctx.type == PIPE_SHADER_VERTEX)  {
3240 
3241 		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3242 		if (ctx.info.num_inputs)
3243 			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3244 	}
3245 	if (ctx.type == PIPE_SHADER_FRAGMENT) {
3246 		if (ctx.bc->chip_class >= EVERGREEN)
3247 			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3248 		else
3249 			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3250 	}
3251 	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3252 		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
3253 		ctx.file_offset[TGSI_FILE_INPUT] = 2;
3254 	}
3255 	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3256 		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3257 	if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3258 		bool add_tesscoord = false, add_tess_inout = false;
3259 		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3260 		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3261 			/* if we have tesscoord save one reg */
3262 			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3263 				add_tesscoord = true;
3264 			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3265 			    ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3266 				add_tess_inout = true;
3267 		}
3268 		if (add_tesscoord || add_tess_inout)
3269 			ctx.file_offset[TGSI_FILE_INPUT]++;
3270 		if (add_tess_inout)
3271 			ctx.file_offset[TGSI_FILE_INPUT]+=2;
3272 	}
3273 	if (ctx.type == PIPE_SHADER_COMPUTE) {
3274 		ctx.file_offset[TGSI_FILE_INPUT] = 2;
3275 		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3276 			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
3277 				ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3278 			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
3279 				ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3280 		}
3281 	}
3282 
3283 	ctx.file_offset[TGSI_FILE_OUTPUT] =
3284 			ctx.file_offset[TGSI_FILE_INPUT] +
3285 			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3286 	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3287 						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3288 
3289 	/* Outside the GPR range. This will be translated to one of the
3290 	 * kcache banks later. */
3291 	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3292 
3293 	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3294 	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3295 			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
3296 	ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
3297 	ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
3298 
3299 	if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3300 		ctx.tess_input_info = ctx.bc->ar_reg + 3;
3301 		ctx.tess_output_info = ctx.bc->ar_reg + 4;
3302 		ctx.temp_reg = ctx.bc->ar_reg + 5;
3303 	} else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3304 		ctx.tess_input_info = 0;
3305 		ctx.tess_output_info = ctx.bc->ar_reg + 3;
3306 		ctx.temp_reg = ctx.bc->ar_reg + 4;
3307 	} else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3308 		ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
3309 		ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
3310 		ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
3311 		ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
3312 		ctx.temp_reg = ctx.bc->ar_reg + 7;
3313 		if (ctx.shader->gs_tri_strip_adj_fix) {
3314 			ctx.gs_rotated_input[0] = ctx.bc->ar_reg + 7;
3315 			ctx.gs_rotated_input[1] = ctx.bc->ar_reg + 8;
3316 			ctx.temp_reg += 2;
3317 		} else {
3318 			ctx.gs_rotated_input[0] = 0;
3319 			ctx.gs_rotated_input[1] = 1;
3320 		}
3321 	} else {
3322 		ctx.temp_reg = ctx.bc->ar_reg + 3;
3323 	}
3324 
3325 	if (shader->uses_images) {
3326 		ctx.thread_id_gpr = ctx.temp_reg++;
3327 		ctx.thread_id_gpr_loaded = false;
3328 	}
3329 
3330 	shader->max_arrays = 0;
3331 	shader->num_arrays = 0;
3332 	if (indirect_gprs) {
3333 
3334 		if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3335 			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3336 			                   ctx.file_offset[TGSI_FILE_OUTPUT] -
3337 			                   ctx.file_offset[TGSI_FILE_INPUT],
3338 			                   0x0F);
3339 		}
3340 		if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3341 			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3342 			                   ctx.file_offset[TGSI_FILE_TEMPORARY] -
3343 			                   ctx.file_offset[TGSI_FILE_OUTPUT],
3344 			                   0x0F);
3345 		}
3346 	}
3347 
3348 	ctx.nliterals = 0;
3349 	ctx.literals = NULL;
3350 	ctx.max_driver_temp_used = 0;
3351 
3352 	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3353 			       ctx.info.colors_written == 1;
3354 	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3355 	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3356 
3357 	if (ctx.type == PIPE_SHADER_VERTEX ||
3358 	    ctx.type == PIPE_SHADER_GEOMETRY ||
3359 	    ctx.type == PIPE_SHADER_TESS_EVAL) {
3360 		shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3361 					      ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3362 		shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3363 		shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3364 	}
3365 
3366 	if (shader->vs_as_gs_a)
3367 		vs_add_primid_output(&ctx, key.vs.prim_id_out);
3368 
3369 	if (ctx.type == PIPE_SHADER_TESS_EVAL)
3370 		r600_fetch_tess_io_info(&ctx);
3371 
3372 	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3373 		tgsi_parse_token(&ctx.parse);
3374 		switch (ctx.parse.FullToken.Token.Type) {
3375 		case TGSI_TOKEN_TYPE_IMMEDIATE:
3376 			immediate = &ctx.parse.FullToken.FullImmediate;
3377 			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3378 			if(ctx.literals == NULL) {
3379 				r = -ENOMEM;
3380 				goto out_err;
3381 			}
3382 			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3383 			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3384 			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3385 			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3386 			ctx.nliterals++;
3387 			break;
3388 		case TGSI_TOKEN_TYPE_DECLARATION:
3389 			r = tgsi_declaration(&ctx);
3390 			if (r)
3391 				goto out_err;
3392 			break;
3393 		case TGSI_TOKEN_TYPE_INSTRUCTION:
3394 		case TGSI_TOKEN_TYPE_PROPERTY:
3395 			break;
3396 		default:
3397 			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3398 			r = -EINVAL;
3399 			goto out_err;
3400 		}
3401 	}
3402 
3403 	shader->ring_item_sizes[0] = ctx.next_ring_offset;
3404 	shader->ring_item_sizes[1] = 0;
3405 	shader->ring_item_sizes[2] = 0;
3406 	shader->ring_item_sizes[3] = 0;
3407 
3408 	/* Process two side if needed */
3409 	if (shader->two_side && ctx.colors_used) {
3410 		int i, count = ctx.shader->ninput;
3411 		unsigned next_lds_loc = ctx.shader->nlds;
3412 
3413 		/* additional inputs will be allocated right after the existing inputs,
3414 		 * we won't need them after the color selection, so we don't need to
3415 		 * reserve these gprs for the rest of the shader code and to adjust
3416 		 * output offsets etc. */
3417 		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3418 				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3419 
3420 		/* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3421 		if (ctx.face_gpr == -1) {
3422 			i = ctx.shader->ninput++;
3423 			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3424 			ctx.shader->input[i].spi_sid = 0;
3425 			ctx.shader->input[i].gpr = gpr++;
3426 			ctx.face_gpr = ctx.shader->input[i].gpr;
3427 		}
3428 
3429 		for (i = 0; i < count; i++) {
3430 			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3431 				int ni = ctx.shader->ninput++;
3432 				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3433 				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3434 				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3435 				ctx.shader->input[ni].gpr = gpr++;
3436 				// TGSI to LLVM needs to know the lds position of inputs.
3437 				// Non LLVM path computes it later (in process_twoside_color)
3438 				ctx.shader->input[ni].lds_pos = next_lds_loc++;
3439 				ctx.shader->input[i].back_color_input = ni;
3440 				if (ctx.bc->chip_class >= EVERGREEN) {
3441 					if ((r = evergreen_interp_input(&ctx, ni)))
3442 						return r;
3443 				}
3444 			}
3445 		}
3446 	}
3447 
3448 	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3449 		shader->nr_ps_max_color_exports = 8;
3450 
3451 	if (ctx.fragcoord_input >= 0) {
3452 		if (ctx.bc->chip_class == CAYMAN) {
3453 			for (j = 0 ; j < 4; j++) {
3454 				struct r600_bytecode_alu alu;
3455 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3456 				alu.op = ALU_OP1_RECIP_IEEE;
3457 				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3458 				alu.src[0].chan = 3;
3459 
3460 				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3461 				alu.dst.chan = j;
3462 				alu.dst.write = (j == 3);
3463 				alu.last = (j == 3);
3464 				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3465 					return r;
3466 			}
3467 		} else {
3468 			struct r600_bytecode_alu alu;
3469 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3470 			alu.op = ALU_OP1_RECIP_IEEE;
3471 			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3472 			alu.src[0].chan = 3;
3473 
3474 			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3475 			alu.dst.chan = 3;
3476 			alu.dst.write = 1;
3477 			alu.last = 1;
3478 			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3479 				return r;
3480 		}
3481 	}
3482 
3483 	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3484 		struct r600_bytecode_alu alu;
3485 		int r;
3486 
3487 		/* GS thread with no output workaround - emit a cut at start of GS */
3488 		if (ctx.bc->chip_class == R600)
3489 			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3490 
3491 		for (j = 0; j < 4; j++) {
3492 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3493 			alu.op = ALU_OP1_MOV;
3494 			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3495 			alu.src[0].value = 0;
3496 			alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3497 			alu.dst.write = 1;
3498 			alu.last = 1;
3499 			r = r600_bytecode_add_alu(ctx.bc, &alu);
3500 			if (r)
3501 				return r;
3502 		}
3503 
3504 		if (ctx.shader->gs_tri_strip_adj_fix) {
3505 			r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
3506 					   ctx.gs_rotated_input[0], 2,
3507 					   0, 2,
3508 					   V_SQ_ALU_SRC_LITERAL, 1);
3509 			if (r)
3510 				return r;
3511 
3512 			for (i = 0; i < 6; i++) {
3513 				int rotated = (i + 4) % 6;
3514 				int offset_reg = i / 3;
3515 				int offset_chan = i % 3;
3516 				int rotated_offset_reg = rotated / 3;
3517 				int rotated_offset_chan = rotated % 3;
3518 
3519 				if (offset_reg == 0 && offset_chan == 2)
3520 					offset_chan = 3;
3521 				if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
3522 					rotated_offset_chan = 3;
3523 
3524 				r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
3525 						   ctx.gs_rotated_input[offset_reg], offset_chan,
3526 						   ctx.gs_rotated_input[0], 2,
3527 						   offset_reg, offset_chan,
3528 						   rotated_offset_reg, rotated_offset_chan);
3529 				if (r)
3530 					return r;
3531 			}
3532 		}
3533 	}
3534 
3535 	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3536 		r600_fetch_tess_io_info(&ctx);
3537 
3538 	if (shader->two_side && ctx.colors_used) {
3539 		if ((r = process_twoside_color_inputs(&ctx)))
3540 			return r;
3541 	}
3542 
3543 	tgsi_parse_init(&ctx.parse, tokens);
3544 	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3545 		tgsi_parse_token(&ctx.parse);
3546 		switch (ctx.parse.FullToken.Token.Type) {
3547 		case TGSI_TOKEN_TYPE_INSTRUCTION:
3548 			r = tgsi_is_supported(&ctx);
3549 			if (r)
3550 				goto out_err;
3551 			ctx.max_driver_temp_used = 0;
3552 			/* reserve first tmp for everyone */
3553 			r600_get_temp(&ctx);
3554 
3555 			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3556 			if ((r = tgsi_split_constant(&ctx)))
3557 				goto out_err;
3558 			if ((r = tgsi_split_literal_constant(&ctx)))
3559 				goto out_err;
3560 			if (ctx.type == PIPE_SHADER_GEOMETRY) {
3561 				if ((r = tgsi_split_gs_inputs(&ctx)))
3562 					goto out_err;
3563 			} else if (lds_inputs) {
3564 				if ((r = tgsi_split_lds_inputs(&ctx)))
3565 					goto out_err;
3566 			}
3567 			if (ctx.bc->chip_class == CAYMAN)
3568 				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3569 			else if (ctx.bc->chip_class >= EVERGREEN)
3570 				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3571 			else
3572 				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3573 			r = ctx.inst_info->process(&ctx);
3574 			if (r)
3575 				goto out_err;
3576 
3577 			if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3578 				r = r600_store_tcs_output(&ctx);
3579 				if (r)
3580 					goto out_err;
3581 			}
3582 			break;
3583 		default:
3584 			break;
3585 		}
3586 	}
3587 
3588 	/* Reset the temporary register counter. */
3589 	ctx.max_driver_temp_used = 0;
3590 
3591 	noutput = shader->noutput;
3592 
3593 	if (!ring_outputs && ctx.clip_vertex_write) {
3594 		unsigned clipdist_temp[2];
3595 
3596 		clipdist_temp[0] = r600_get_temp(&ctx);
3597 		clipdist_temp[1] = r600_get_temp(&ctx);
3598 
3599 		/* need to convert a clipvertex write into clipdistance writes and not export
3600 		   the clip vertex anymore */
3601 
3602 		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
3603 		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3604 		shader->output[noutput].gpr = clipdist_temp[0];
3605 		noutput++;
3606 		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3607 		shader->output[noutput].gpr = clipdist_temp[1];
3608 		noutput++;
3609 
3610 		/* reset spi_sid for clipvertex output to avoid confusing spi */
3611 		shader->output[ctx.cv_output].spi_sid = 0;
3612 
3613 		shader->clip_dist_write = 0xFF;
3614 		shader->cc_dist_mask = 0xFF;
3615 
3616 		for (i = 0; i < 8; i++) {
3617 			int oreg = i >> 2;
3618 			int ochan = i & 3;
3619 
3620 			for (j = 0; j < 4; j++) {
3621 				struct r600_bytecode_alu alu;
3622 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3623 				alu.op = ALU_OP2_DOT4;
3624 				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
3625 				alu.src[0].chan = j;
3626 
3627 				alu.src[1].sel = 512 + i;
3628 				alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3629 				alu.src[1].chan = j;
3630 
3631 				alu.dst.sel = clipdist_temp[oreg];
3632 				alu.dst.chan = j;
3633 				alu.dst.write = (j == ochan);
3634 				if (j == 3)
3635 					alu.last = 1;
3636 				r = r600_bytecode_add_alu(ctx.bc, &alu);
3637 				if (r)
3638 					return r;
3639 			}
3640 		}
3641 	}
3642 
3643 	/* Add stream outputs. */
3644 	if (so.num_outputs) {
3645 		bool emit = false;
3646 		if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
3647 			emit = true;
3648 		if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
3649 			emit = true;
3650 		if (emit)
3651 			emit_streamout(&ctx, &so, -1, NULL);
3652 	}
3653 	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
3654 	convert_edgeflag_to_int(&ctx);
3655 
3656 	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3657 		r600_emit_tess_factor(&ctx);
3658 
3659 	if (lds_outputs) {
3660 		if (ctx.type == PIPE_SHADER_VERTEX) {
3661 			if (ctx.shader->noutput)
3662 				emit_lds_vs_writes(&ctx);
3663 		}
3664 	} else if (ring_outputs) {
3665 		if (shader->vs_as_es || shader->tes_as_es) {
3666 			ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
3667 			ctx.gs_export_gpr_tregs[1] = -1;
3668 			ctx.gs_export_gpr_tregs[2] = -1;
3669 			ctx.gs_export_gpr_tregs[3] = -1;
3670 
3671 			emit_gs_ring_writes(&ctx, &so, -1, FALSE);
3672 		}
3673 	} else {
3674 		/* Export output */
3675 		next_clip_base = shader->vs_out_misc_write ? 62 : 61;
3676 
3677 		for (i = 0, j = 0; i < noutput; i++, j++) {
3678 			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3679 			output[j].gpr = shader->output[i].gpr;
3680 			output[j].elem_size = 3;
3681 			output[j].swizzle_x = 0;
3682 			output[j].swizzle_y = 1;
3683 			output[j].swizzle_z = 2;
3684 			output[j].swizzle_w = 3;
3685 			output[j].burst_count = 1;
3686 			output[j].type = 0xffffffff;
3687 			output[j].op = CF_OP_EXPORT;
3688 			switch (ctx.type) {
3689 			case PIPE_SHADER_VERTEX:
3690 			case PIPE_SHADER_TESS_EVAL:
3691 				switch (shader->output[i].name) {
3692 				case TGSI_SEMANTIC_POSITION:
3693 					output[j].array_base = 60;
3694 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3695 					pos_emitted = true;
3696 					break;
3697 
3698 				case TGSI_SEMANTIC_PSIZE:
3699 					output[j].array_base = 61;
3700 					output[j].swizzle_y = 7;
3701 					output[j].swizzle_z = 7;
3702 					output[j].swizzle_w = 7;
3703 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3704 					pos_emitted = true;
3705 					break;
3706 				case TGSI_SEMANTIC_EDGEFLAG:
3707 					output[j].array_base = 61;
3708 					output[j].swizzle_x = 7;
3709 					output[j].swizzle_y = 0;
3710 					output[j].swizzle_z = 7;
3711 					output[j].swizzle_w = 7;
3712 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3713 					pos_emitted = true;
3714 					break;
3715 				case TGSI_SEMANTIC_LAYER:
3716 					/* spi_sid is 0 for outputs that are
3717 					 * not consumed by PS */
3718 					if (shader->output[i].spi_sid) {
3719 						output[j].array_base = next_param_base++;
3720 						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3721 						j++;
3722 						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3723 					}
3724 					output[j].array_base = 61;
3725 					output[j].swizzle_x = 7;
3726 					output[j].swizzle_y = 7;
3727 					output[j].swizzle_z = 0;
3728 					output[j].swizzle_w = 7;
3729 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3730 					pos_emitted = true;
3731 					break;
3732 				case TGSI_SEMANTIC_VIEWPORT_INDEX:
3733 					/* spi_sid is 0 for outputs that are
3734 					 * not consumed by PS */
3735 					if (shader->output[i].spi_sid) {
3736 						output[j].array_base = next_param_base++;
3737 						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3738 						j++;
3739 						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3740 					}
3741 					output[j].array_base = 61;
3742 					output[j].swizzle_x = 7;
3743 					output[j].swizzle_y = 7;
3744 					output[j].swizzle_z = 7;
3745 					output[j].swizzle_w = 0;
3746 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3747 					pos_emitted = true;
3748 					break;
3749 				case TGSI_SEMANTIC_CLIPVERTEX:
3750 					j--;
3751 					break;
3752 				case TGSI_SEMANTIC_CLIPDIST:
3753 					output[j].array_base = next_clip_base++;
3754 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3755 					pos_emitted = true;
3756 					/* spi_sid is 0 for clipdistance outputs that were generated
3757 					 * for clipvertex - we don't need to pass them to PS */
3758 					if (shader->output[i].spi_sid) {
3759 						j++;
3760 						/* duplicate it as PARAM to pass to the pixel shader */
3761 						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3762 						output[j].array_base = next_param_base++;
3763 						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3764 					}
3765 					break;
3766 				case TGSI_SEMANTIC_FOG:
3767 					output[j].swizzle_y = 4; /* 0 */
3768 					output[j].swizzle_z = 4; /* 0 */
3769 					output[j].swizzle_w = 5; /* 1 */
3770 					break;
3771 				case TGSI_SEMANTIC_PRIMID:
3772 					output[j].swizzle_x = 2;
3773 					output[j].swizzle_y = 4; /* 0 */
3774 					output[j].swizzle_z = 4; /* 0 */
3775 					output[j].swizzle_w = 4; /* 0 */
3776 					break;
3777 				}
3778 
3779 				break;
3780 			case PIPE_SHADER_FRAGMENT:
3781 				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
3782 					/* never export more colors than the number of CBs */
3783 					if (shader->output[i].sid >= max_color_exports) {
3784 						/* skip export */
3785 						j--;
3786 						continue;
3787 					}
3788 					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3789 					output[j].array_base = shader->output[i].sid;
3790 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3791 					shader->nr_ps_color_exports++;
3792 					if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
3793 						for (k = 1; k < max_color_exports; k++) {
3794 							j++;
3795 							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3796 							output[j].gpr = shader->output[i].gpr;
3797 							output[j].elem_size = 3;
3798 							output[j].swizzle_x = 0;
3799 							output[j].swizzle_y = 1;
3800 							output[j].swizzle_z = 2;
3801 							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3802 							output[j].burst_count = 1;
3803 							output[j].array_base = k;
3804 							output[j].op = CF_OP_EXPORT;
3805 							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3806 							shader->nr_ps_color_exports++;
3807 						}
3808 					}
3809 				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
3810 					output[j].array_base = 61;
3811 					output[j].swizzle_x = 2;
3812 					output[j].swizzle_y = 7;
3813 					output[j].swizzle_z = output[j].swizzle_w = 7;
3814 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3815 				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
3816 					output[j].array_base = 61;
3817 					output[j].swizzle_x = 7;
3818 					output[j].swizzle_y = 1;
3819 					output[j].swizzle_z = output[j].swizzle_w = 7;
3820 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3821 				} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
3822 					output[j].array_base = 61;
3823 					output[j].swizzle_x = 7;
3824 					output[j].swizzle_y = 7;
3825 					output[j].swizzle_z = 0;
3826 					output[j].swizzle_w = 7;
3827 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3828 				} else {
3829 					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
3830 					r = -EINVAL;
3831 					goto out_err;
3832 				}
3833 				break;
3834 			case PIPE_SHADER_TESS_CTRL:
3835 				break;
3836 			default:
3837 				R600_ERR("unsupported processor type %d\n", ctx.type);
3838 				r = -EINVAL;
3839 				goto out_err;
3840 			}
3841 
3842 			if (output[j].type == 0xffffffff) {
3843 				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3844 				output[j].array_base = next_param_base++;
3845 			}
3846 		}
3847 
3848 		/* add fake position export */
3849 		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
3850 			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3851 			output[j].gpr = 0;
3852 			output[j].elem_size = 3;
3853 			output[j].swizzle_x = 7;
3854 			output[j].swizzle_y = 7;
3855 			output[j].swizzle_z = 7;
3856 			output[j].swizzle_w = 7;
3857 			output[j].burst_count = 1;
3858 			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3859 			output[j].array_base = 60;
3860 			output[j].op = CF_OP_EXPORT;
3861 			j++;
3862 		}
3863 
3864 		/* add fake param output for vertex shader if no param is exported */
3865 		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
3866 			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3867 			output[j].gpr = 0;
3868 			output[j].elem_size = 3;
3869 			output[j].swizzle_x = 7;
3870 			output[j].swizzle_y = 7;
3871 			output[j].swizzle_z = 7;
3872 			output[j].swizzle_w = 7;
3873 			output[j].burst_count = 1;
3874 			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3875 			output[j].array_base = 0;
3876 			output[j].op = CF_OP_EXPORT;
3877 			j++;
3878 		}
3879 
3880 		/* add fake pixel export */
3881 		if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
3882 			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3883 			output[j].gpr = 0;
3884 			output[j].elem_size = 3;
3885 			output[j].swizzle_x = 7;
3886 			output[j].swizzle_y = 7;
3887 			output[j].swizzle_z = 7;
3888 			output[j].swizzle_w = 7;
3889 			output[j].burst_count = 1;
3890 			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3891 			output[j].array_base = 0;
3892 			output[j].op = CF_OP_EXPORT;
3893 			j++;
3894 			shader->nr_ps_color_exports++;
3895 		}
3896 
3897 		noutput = j;
3898 
3899 		/* set export done on last export of each type */
3900 		for (k = noutput - 1, output_done = 0; k >= 0; k--) {
3901 			if (!(output_done & (1 << output[k].type))) {
3902 				output_done |= (1 << output[k].type);
3903 				output[k].op = CF_OP_EXPORT_DONE;
3904 			}
3905 		}
3906 		/* add output to bytecode */
3907 		for (i = 0; i < noutput; i++) {
3908 			r = r600_bytecode_add_output(ctx.bc, &output[i]);
3909 			if (r)
3910 				goto out_err;
3911 		}
3912 	}
3913 
3914 	/* add program end */
3915 	if (ctx.bc->chip_class == CAYMAN)
3916 		cm_bytecode_add_cf_end(ctx.bc);
3917 	else {
3918 		const struct cf_op_info *last = NULL;
3919 
3920 		if (ctx.bc->cf_last)
3921 			last = r600_isa_cf(ctx.bc->cf_last->op);
3922 
3923 		/* alu clause instructions don't have EOP bit, so add NOP */
3924 		if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
3925 			r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
3926 
3927 		ctx.bc->cf_last->end_of_program = 1;
3928 	}
3929 
3930 	/* check GPR limit - we have 124 = 128 - 4
3931 	 * (4 are reserved as alu clause temporary registers) */
3932 	if (ctx.bc->ngpr > 124) {
3933 		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
3934 		r = -ENOMEM;
3935 		goto out_err;
3936 	}
3937 
3938 	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3939 		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
3940 			return r;
3941 	}
3942 
3943 	free(ctx.literals);
3944 	tgsi_parse_free(&ctx.parse);
3945 	return 0;
3946 out_err:
3947 	free(ctx.literals);
3948 	tgsi_parse_free(&ctx.parse);
3949 	return r;
3950 }
3951 
tgsi_unsupported(struct r600_shader_ctx * ctx)3952 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
3953 {
3954 	const unsigned tgsi_opcode =
3955 		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
3956 	R600_ERR("%s tgsi opcode unsupported\n",
3957 		 tgsi_get_opcode_name(tgsi_opcode));
3958 	return -EINVAL;
3959 }
3960 
/*
 * Handler that emits nothing: the context is not touched and the call
 * always reports success (0).
 */
static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
{
	return 0;
}
3965 
r600_bytecode_src(struct r600_bytecode_alu_src * bc_src,const struct r600_shader_src * shader_src,unsigned chan)3966 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
3967 			const struct r600_shader_src *shader_src,
3968 			unsigned chan)
3969 {
3970 	bc_src->sel = shader_src->sel;
3971 	bc_src->chan = shader_src->swizzle[chan];
3972 	bc_src->neg = shader_src->neg;
3973 	bc_src->abs = shader_src->abs;
3974 	bc_src->rel = shader_src->rel;
3975 	bc_src->value = shader_src->value[bc_src->chan];
3976 	bc_src->kc_bank = shader_src->kc_bank;
3977 	bc_src->kc_rel = shader_src->kc_rel;
3978 }
3979 
r600_bytecode_src_set_abs(struct r600_bytecode_alu_src * bc_src)3980 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
3981 {
3982 	bc_src->abs = 1;
3983 	bc_src->neg = 0;
3984 }
3985 
r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src * bc_src)3986 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
3987 {
3988 	bc_src->neg = !bc_src->neg;
3989 }
3990 
tgsi_dst(struct r600_shader_ctx * ctx,const struct tgsi_full_dst_register * tgsi_dst,unsigned swizzle,struct r600_bytecode_alu_dst * r600_dst)3991 static void tgsi_dst(struct r600_shader_ctx *ctx,
3992 		     const struct tgsi_full_dst_register *tgsi_dst,
3993 		     unsigned swizzle,
3994 		     struct r600_bytecode_alu_dst *r600_dst)
3995 {
3996 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3997 
3998 	r600_dst->sel = tgsi_dst->Register.Index;
3999 	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
4000 	r600_dst->chan = swizzle;
4001 	r600_dst->write = 1;
4002 	if (inst->Instruction.Saturate) {
4003 		r600_dst->clamp = 1;
4004 	}
4005 	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
4006 		if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
4007 			return;
4008 		}
4009 	}
4010 	if (tgsi_dst->Register.Indirect)
4011 		r600_dst->rel = V_SQ_REL_RELATIVE;
4012 
4013 }
4014 
tgsi_op2_64_params(struct r600_shader_ctx * ctx,bool singledest,bool swap,int dest_temp,int op_override)4015 static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
4016 {
4017 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4018 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4019 	struct r600_bytecode_alu alu;
4020 	int i, j, r, lasti = tgsi_last_instruction(write_mask);
4021 	int use_tmp = 0;
4022 	int swizzle_x = inst->Src[0].Register.SwizzleX;
4023 
4024 	if (singledest) {
4025 		switch (write_mask) {
4026 		case 0x1:
4027 			if (swizzle_x == 2) {
4028 				write_mask = 0xc;
4029 				use_tmp = 3;
4030 			} else
4031 				write_mask = 0x3;
4032 			break;
4033 		case 0x2:
4034 			if (swizzle_x == 2) {
4035 				write_mask = 0xc;
4036 				use_tmp = 3;
4037 			} else {
4038 				write_mask = 0x3;
4039 				use_tmp = 1;
4040 			}
4041 			break;
4042 		case 0x4:
4043 			if (swizzle_x == 0) {
4044 				write_mask = 0x3;
4045 				use_tmp = 1;
4046 			} else
4047 				write_mask = 0xc;
4048 			break;
4049 		case 0x8:
4050 			if (swizzle_x == 0) {
4051 				write_mask = 0x3;
4052 				use_tmp = 1;
4053 			} else {
4054 				write_mask = 0xc;
4055 				use_tmp = 3;
4056 			}
4057 			break;
4058 		}
4059 	}
4060 
4061 	lasti = tgsi_last_instruction(write_mask);
4062 	for (i = 0; i <= lasti; i++) {
4063 
4064 		if (!(write_mask & (1 << i)))
4065 			continue;
4066 
4067 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4068 
4069 		if (singledest) {
4070 			if (use_tmp || dest_temp) {
4071 				alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
4072 				alu.dst.chan = i;
4073 				alu.dst.write = 1;
4074 			} else {
4075 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4076 			}
4077 			if (i == 1 || i == 3)
4078 				alu.dst.write = 0;
4079 		} else
4080 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4081 
4082 		alu.op = op_override ? op_override : ctx->inst_info->op;
4083 		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
4084 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4085 		} else if (!swap) {
4086 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4087 				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
4088 			}
4089 		} else {
4090 			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
4091 			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
4092 		}
4093 
4094 		/* handle some special cases */
4095 		if (i == 1 || i == 3) {
4096 			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
4097 			case TGSI_OPCODE_DABS:
4098 				r600_bytecode_src_set_abs(&alu.src[0]);
4099 				break;
4100 			default:
4101 				break;
4102 			}
4103 		}
4104 		if (i == lasti) {
4105 			alu.last = 1;
4106 		}
4107 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4108 		if (r)
4109 			return r;
4110 	}
4111 
4112 	if (use_tmp) {
4113 		write_mask = inst->Dst[0].Register.WriteMask;
4114 
4115 		lasti = tgsi_last_instruction(write_mask);
4116 		/* move result from temp to dst */
4117 		for (i = 0; i <= lasti; i++) {
4118 			if (!(write_mask & (1 << i)))
4119 				continue;
4120 
4121 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4122 			alu.op = ALU_OP1_MOV;
4123 
4124 			if (dest_temp) {
4125 				alu.dst.sel = dest_temp;
4126 				alu.dst.chan = i;
4127 				alu.dst.write = 1;
4128 			} else
4129 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4130 			alu.src[0].sel = ctx->temp_reg;
4131 			alu.src[0].chan = use_tmp - 1;
4132 			alu.last = (i == lasti);
4133 
4134 			r = r600_bytecode_add_alu(ctx->bc, &alu);
4135 			if (r)
4136 				return r;
4137 		}
4138 	}
4139 	return 0;
4140 }
4141 
tgsi_op2_64(struct r600_shader_ctx * ctx)4142 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
4143 {
4144 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4145 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4146 	/* confirm writemasking */
4147 	if ((write_mask & 0x3) != 0x3 &&
4148 	    (write_mask & 0xc) != 0xc) {
4149 		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
4150 		return -1;
4151 	}
4152 	return tgsi_op2_64_params(ctx, false, false, 0, 0);
4153 }
4154 
/*
 * 64-bit op handler: calls tgsi_op2_64_params() with single-dest handling
 * enabled, operands in their natural order, and no destination or opcode
 * override.
 */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	const bool singledest = true;
	const bool swap = false;

	return tgsi_op2_64_params(ctx, singledest, swap, 0, 0);
}
4159 
/*
 * 64-bit op handler: calls tgsi_op2_64_params() with single-dest handling
 * enabled and the two source operands swapped; no destination or opcode
 * override.
 */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	const bool singledest = true;
	const bool swap = true;

	return tgsi_op2_64_params(ctx, singledest, swap, 0, 0);
}
4164 
tgsi_op3_64(struct r600_shader_ctx * ctx)4165 static int tgsi_op3_64(struct r600_shader_ctx *ctx)
4166 {
4167 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4168 	struct r600_bytecode_alu alu;
4169 	int i, j, r;
4170 	int lasti = 3;
4171 	int tmp = r600_get_temp(ctx);
4172 
4173 	for (i = 0; i < lasti + 1; i++) {
4174 
4175 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4176 		alu.op = ctx->inst_info->op;
4177 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4178 			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
4179 		}
4180 
4181 		if (inst->Dst[0].Register.WriteMask & (1 << i))
4182 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4183 		else
4184 			alu.dst.sel = tmp;
4185 
4186 		alu.dst.chan = i;
4187 		alu.is_op3 = 1;
4188 		if (i == lasti) {
4189 			alu.last = 1;
4190 		}
4191 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4192 		if (r)
4193 			return r;
4194 	}
4195 	return 0;
4196 }
4197 
tgsi_op2_s(struct r600_shader_ctx * ctx,int swap,int trans_only)4198 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
4199 {
4200 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4201 	struct r600_bytecode_alu alu;
4202 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4203 	int i, j, r, lasti = tgsi_last_instruction(write_mask);
4204 	/* use temp register if trans_only and more than one dst component */
4205 	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
4206 	unsigned op = ctx->inst_info->op;
4207 
4208 	if (op == ALU_OP2_MUL_IEEE &&
4209 	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
4210 		op = ALU_OP2_MUL;
4211 
4212 	for (i = 0; i <= lasti; i++) {
4213 		if (!(write_mask & (1 << i)))
4214 			continue;
4215 
4216 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4217 		if (use_tmp) {
4218 			alu.dst.sel = ctx->temp_reg;
4219 			alu.dst.chan = i;
4220 			alu.dst.write = 1;
4221 		} else
4222 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4223 
4224 		alu.op = op;
4225 		if (!swap) {
4226 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4227 				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
4228 			}
4229 		} else {
4230 			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4231 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4232 		}
4233 		if (i == lasti || trans_only) {
4234 			alu.last = 1;
4235 		}
4236 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4237 		if (r)
4238 			return r;
4239 	}
4240 
4241 	if (use_tmp) {
4242 		/* move result from temp to dst */
4243 		for (i = 0; i <= lasti; i++) {
4244 			if (!(write_mask & (1 << i)))
4245 				continue;
4246 
4247 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4248 			alu.op = ALU_OP1_MOV;
4249 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4250 			alu.src[0].sel = ctx->temp_reg;
4251 			alu.src[0].chan = i;
4252 			alu.last = (i == lasti);
4253 
4254 			r = r600_bytecode_add_alu(ctx->bc, &alu);
4255 			if (r)
4256 				return r;
4257 		}
4258 	}
4259 	return 0;
4260 }
4261 
/* Per-channel op handler: tgsi_op2_s() with no operand swap and
 * trans_only disabled. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
4266 
/* Per-channel op handler: tgsi_op2_s() with the two sources exchanged and
 * trans_only disabled. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	const int swap = 1;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
4271 
/* Per-channel op handler: tgsi_op2_s() with no operand swap and
 * trans_only enabled. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 1;

	return tgsi_op2_s(ctx, swap, trans_only);
}
4276 
tgsi_ineg(struct r600_shader_ctx * ctx)4277 static int tgsi_ineg(struct r600_shader_ctx *ctx)
4278 {
4279 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4280 	struct r600_bytecode_alu alu;
4281 	int i, r;
4282 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4283 
4284 	for (i = 0; i < lasti + 1; i++) {
4285 
4286 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4287 			continue;
4288 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4289 		alu.op = ctx->inst_info->op;
4290 
4291 		alu.src[0].sel = V_SQ_ALU_SRC_0;
4292 
4293 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4294 
4295 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4296 
4297 		if (i == lasti) {
4298 			alu.last = 1;
4299 		}
4300 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4301 		if (r)
4302 			return r;
4303 	}
4304 	return 0;
4305 
4306 }
4307 
tgsi_dneg(struct r600_shader_ctx * ctx)4308 static int tgsi_dneg(struct r600_shader_ctx *ctx)
4309 {
4310 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4311 	struct r600_bytecode_alu alu;
4312 	int i, r;
4313 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4314 
4315 	for (i = 0; i < lasti + 1; i++) {
4316 
4317 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4318 			continue;
4319 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4320 		alu.op = ALU_OP1_MOV;
4321 
4322 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4323 
4324 		if (i == 1 || i == 3)
4325 			r600_bytecode_src_toggle_neg(&alu.src[0]);
4326 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4327 
4328 		if (i == lasti) {
4329 			alu.last = 1;
4330 		}
4331 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4332 		if (r)
4333 			return r;
4334 	}
4335 	return 0;
4336 
4337 }
4338 
tgsi_dfracexp(struct r600_shader_ctx * ctx)4339 static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
4340 {
4341 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4342 	struct r600_bytecode_alu alu;
4343 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4344 	int i, j, r;
4345 
4346 	for (i = 0; i <= 3; i++) {
4347 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4348 		alu.op = ctx->inst_info->op;
4349 
4350 		alu.dst.sel = ctx->temp_reg;
4351 		alu.dst.chan = i;
4352 		alu.dst.write = 1;
4353 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4354 			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
4355 		}
4356 
4357 		if (i == 3)
4358 			alu.last = 1;
4359 
4360 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4361 		if (r)
4362 			return r;
4363 	}
4364 
4365 	/* Replicate significand result across channels. */
4366 	for (i = 0; i <= 3; i++) {
4367 		if (!(write_mask & (1 << i)))
4368 			continue;
4369 
4370 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4371 		alu.op = ALU_OP1_MOV;
4372 		alu.src[0].chan = (i & 1) + 2;
4373 		alu.src[0].sel = ctx->temp_reg;
4374 
4375 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4376 		alu.dst.write = 1;
4377 		alu.last = 1;
4378 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4379 		if (r)
4380 			return r;
4381 	}
4382 
4383 	for (i = 0; i <= 3; i++) {
4384 		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
4385 			/* MOV third channels to writemask dst1 */
4386 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4387 			alu.op = ALU_OP1_MOV;
4388 			alu.src[0].chan = 1;
4389 			alu.src[0].sel = ctx->temp_reg;
4390 
4391 			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
4392 			alu.last = 1;
4393 			r = r600_bytecode_add_alu(ctx->bc, &alu);
4394 			if (r)
4395 				return r;
4396 			break;
4397 		}
4398 	}
4399 	return 0;
4400 }
4401 
4402 
egcm_int_to_double(struct r600_shader_ctx * ctx)4403 static int egcm_int_to_double(struct r600_shader_ctx *ctx)
4404 {
4405 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4406 	struct r600_bytecode_alu alu;
4407 	int i, r;
4408 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4409 
4410 	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
4411 		inst->Instruction.Opcode == TGSI_OPCODE_U2D);
4412 
4413 	for (i = 0; i <= (lasti+1)/2; i++) {
4414 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4415 		alu.op = ctx->inst_info->op;
4416 
4417 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4418 		alu.dst.sel = ctx->temp_reg;
4419 		alu.dst.chan = i;
4420 		alu.dst.write = 1;
4421 		alu.last = 1;
4422 
4423 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4424 		if (r)
4425 			return r;
4426 	}
4427 
4428 	for (i = 0; i <= lasti; i++) {
4429 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4430 		alu.op = ALU_OP1_FLT32_TO_FLT64;
4431 
4432 		alu.src[0].chan = i/2;
4433 		if (i%2 == 0)
4434 			alu.src[0].sel = ctx->temp_reg;
4435 		else {
4436 			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
4437 			alu.src[0].value = 0x0;
4438 		}
4439 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4440 		alu.last = i == lasti;
4441 
4442 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4443 		if (r)
4444 			return r;
4445 	}
4446 
4447 	return 0;
4448 }
4449 
egcm_double_to_int(struct r600_shader_ctx * ctx)4450 static int egcm_double_to_int(struct r600_shader_ctx *ctx)
4451 {
4452 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4453 	struct r600_bytecode_alu alu;
4454 	int i, r;
4455 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4456 	int treg = r600_get_temp(ctx);
4457 	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
4458 		inst->Instruction.Opcode == TGSI_OPCODE_D2U);
4459 
4460 	/* do a 64->32 into a temp register */
4461 	r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
4462 	if (r)
4463 		return r;
4464 
4465 	for (i = 0; i <= lasti; i++) {
4466 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4467 			continue;
4468 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4469 		alu.op = ctx->inst_info->op;
4470 
4471 		alu.src[0].chan = i;
4472 		alu.src[0].sel = treg;
4473 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4474 		alu.last = (i == lasti);
4475 
4476 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4477 		if (r)
4478 			return r;
4479 	}
4480 
4481 	return 0;
4482 }
4483 
cayman_emit_unary_double_raw(struct r600_bytecode * bc,unsigned op,int dst_reg,struct r600_shader_src * src,bool abs)4484 static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
4485 					unsigned op,
4486 					int dst_reg,
4487 					struct r600_shader_src *src,
4488 					bool abs)
4489 {
4490 	struct r600_bytecode_alu alu;
4491 	const int last_slot = 3;
4492 	int r;
4493 
4494 	/* these have to write the result to X/Y by the looks of it */
4495 	for (int i = 0 ; i < last_slot; i++) {
4496 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4497 		alu.op = op;
4498 
4499 		r600_bytecode_src(&alu.src[0], src, 1);
4500 		r600_bytecode_src(&alu.src[1], src, 0);
4501 
4502 		if (abs)
4503 			r600_bytecode_src_set_abs(&alu.src[1]);
4504 
4505 		alu.dst.sel = dst_reg;
4506 		alu.dst.chan = i;
4507 		alu.dst.write = (i == 0 || i == 1);
4508 
4509 		if (bc->chip_class != CAYMAN || i == last_slot - 1)
4510 			alu.last = 1;
4511 		r = r600_bytecode_add_alu(bc, &alu);
4512 		if (r)
4513 			return r;
4514 	}
4515 
4516 	return 0;
4517 }
4518 
cayman_emit_double_instr(struct r600_shader_ctx * ctx)4519 static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
4520 {
4521 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4522 	int i, r;
4523 	struct r600_bytecode_alu alu;
4524 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4525 	int t1 = ctx->temp_reg;
4526 
4527 	/* should only be one src regs */
4528 	assert(inst->Instruction.NumSrcRegs == 1);
4529 
4530 	/* only support one double at a time */
4531 	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
4532 	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
4533 
4534 	r = cayman_emit_unary_double_raw(
4535 		ctx->bc, ctx->inst_info->op, t1,
4536 		&ctx->src[0],
4537 		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
4538 		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
4539 	if (r)
4540 		return r;
4541 
4542 	for (i = 0 ; i <= lasti; i++) {
4543 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4544 			continue;
4545 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4546 		alu.op = ALU_OP1_MOV;
4547 		alu.src[0].sel = t1;
4548 		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
4549 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4550 		alu.dst.write = 1;
4551 		if (i == lasti)
4552 			alu.last = 1;
4553 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4554 		if (r)
4555 			return r;
4556 	}
4557 	return 0;
4558 }
4559 
cayman_emit_float_instr(struct r600_shader_ctx * ctx)4560 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
4561 {
4562 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4563 	int i, j, r;
4564 	struct r600_bytecode_alu alu;
4565 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
4566 
4567 	for (i = 0 ; i < last_slot; i++) {
4568 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4569 		alu.op = ctx->inst_info->op;
4570 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4571 			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
4572 
4573 			/* RSQ should take the absolute value of src */
4574 			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
4575 				r600_bytecode_src_set_abs(&alu.src[j]);
4576 			}
4577 		}
4578 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4579 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4580 
4581 		if (i == last_slot - 1)
4582 			alu.last = 1;
4583 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4584 		if (r)
4585 			return r;
4586 	}
4587 	return 0;
4588 }
4589 
cayman_mul_int_instr(struct r600_shader_ctx * ctx)4590 static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
4591 {
4592 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4593 	int i, j, k, r;
4594 	struct r600_bytecode_alu alu;
4595 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4596 	int t1 = ctx->temp_reg;
4597 
4598 	for (k = 0; k <= lasti; k++) {
4599 		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
4600 			continue;
4601 
4602 		for (i = 0 ; i < 4; i++) {
4603 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4604 			alu.op = ctx->inst_info->op;
4605 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4606 				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
4607 			}
4608 			alu.dst.sel = t1;
4609 			alu.dst.chan = i;
4610 			alu.dst.write = (i == k);
4611 			if (i == 3)
4612 				alu.last = 1;
4613 			r = r600_bytecode_add_alu(ctx->bc, &alu);
4614 			if (r)
4615 				return r;
4616 		}
4617 	}
4618 
4619 	for (i = 0 ; i <= lasti; i++) {
4620 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4621 			continue;
4622 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4623 		alu.op = ALU_OP1_MOV;
4624 		alu.src[0].sel = t1;
4625 		alu.src[0].chan = i;
4626 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4627 		alu.dst.write = 1;
4628 		if (i == lasti)
4629 			alu.last = 1;
4630 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4631 		if (r)
4632 			return r;
4633 	}
4634 
4635 	return 0;
4636 }
4637 
4638 
cayman_mul_double_instr(struct r600_shader_ctx * ctx)4639 static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
4640 {
4641 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4642 	int i, j, k, r;
4643 	struct r600_bytecode_alu alu;
4644 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4645 	int t1 = ctx->temp_reg;
4646 
4647 	/* t1 would get overwritten below if we actually tried to
4648 	 * multiply two pairs of doubles at a time. */
4649 	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
4650 	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
4651 
4652 	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
4653 
4654 	for (i = 0; i < 4; i++) {
4655 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4656 		alu.op = ctx->inst_info->op;
4657 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4658 			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
4659 		}
4660 		alu.dst.sel = t1;
4661 		alu.dst.chan = i;
4662 		alu.dst.write = 1;
4663 		if (i == 3)
4664 			alu.last = 1;
4665 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4666 		if (r)
4667 			return r;
4668 	}
4669 
4670 	for (i = 0; i <= lasti; i++) {
4671 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4672 			continue;
4673 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4674 		alu.op = ALU_OP1_MOV;
4675 		alu.src[0].sel = t1;
4676 		alu.src[0].chan = i;
4677 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4678 		alu.dst.write = 1;
4679 		if (i == lasti)
4680 			alu.last = 1;
4681 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4682 		if (r)
4683 			return r;
4684 	}
4685 
4686 	return 0;
4687 }
4688 
4689 /*
4690  * Emit RECIP_64 + MUL_64 to implement division.
4691  */
cayman_ddiv_instr(struct r600_shader_ctx * ctx)4692 static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
4693 {
4694 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4695 	int r;
4696 	struct r600_bytecode_alu alu;
4697 	int t1 = ctx->temp_reg;
4698 	int k;
4699 
4700 	/* Only support one double at a time. This is the same constraint as
4701 	 * in DMUL lowering. */
4702 	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
4703 	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
4704 
4705 	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
4706 
4707 	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
4708 	if (r)
4709 		return r;
4710 
4711 	for (int i = 0; i < 4; i++) {
4712 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4713 		alu.op = ALU_OP2_MUL_64;
4714 
4715 		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));
4716 
4717 		alu.src[1].sel = t1;
4718 		alu.src[1].chan = (i == 3) ? 0 : 1;
4719 
4720 		alu.dst.sel = t1;
4721 		alu.dst.chan = i;
4722 		alu.dst.write = 1;
4723 		if (i == 3)
4724 			alu.last = 1;
4725 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4726 		if (r)
4727 			return r;
4728 	}
4729 
4730 	for (int i = 0; i < 2; i++) {
4731 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4732 		alu.op = ALU_OP1_MOV;
4733 		alu.src[0].sel = t1;
4734 		alu.src[0].chan = i;
4735 		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
4736 		alu.dst.write = 1;
4737 		if (i == 1)
4738 			alu.last = 1;
4739 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4740 		if (r)
4741 			return r;
4742 	}
4743 	return 0;
4744 }
4745 
4746 /*
4747  * r600 - trunc to -PI..PI range
4748  * r700 - normalize by dividing by 2PI
4749  * see fdo bug 27901
4750  */
tgsi_setup_trig(struct r600_shader_ctx * ctx)4751 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
4752 {
4753 	int r;
4754 	struct r600_bytecode_alu alu;
4755 
4756 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4757 	alu.op = ALU_OP3_MULADD;
4758 	alu.is_op3 = 1;
4759 
4760 	alu.dst.chan = 0;
4761 	alu.dst.sel = ctx->temp_reg;
4762 	alu.dst.write = 1;
4763 
4764 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4765 
4766 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4767 	alu.src[1].chan = 0;
4768 	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
4769 	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
4770 	alu.src[2].chan = 0;
4771 	alu.last = 1;
4772 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4773 	if (r)
4774 		return r;
4775 
4776 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4777 	alu.op = ALU_OP1_FRACT;
4778 
4779 	alu.dst.chan = 0;
4780 	alu.dst.sel = ctx->temp_reg;
4781 	alu.dst.write = 1;
4782 
4783 	alu.src[0].sel = ctx->temp_reg;
4784 	alu.src[0].chan = 0;
4785 	alu.last = 1;
4786 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4787 	if (r)
4788 		return r;
4789 
4790 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4791 	alu.op = ALU_OP3_MULADD;
4792 	alu.is_op3 = 1;
4793 
4794 	alu.dst.chan = 0;
4795 	alu.dst.sel = ctx->temp_reg;
4796 	alu.dst.write = 1;
4797 
4798 	alu.src[0].sel = ctx->temp_reg;
4799 	alu.src[0].chan = 0;
4800 
4801 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4802 	alu.src[1].chan = 0;
4803 	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4804 	alu.src[2].chan = 0;
4805 
4806 	if (ctx->bc->chip_class == R600) {
4807 		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
4808 		alu.src[2].value = u_bitcast_f2u(-M_PI);
4809 	} else {
4810 		alu.src[1].sel = V_SQ_ALU_SRC_1;
4811 		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
4812 		alu.src[2].neg = 1;
4813 	}
4814 
4815 	alu.last = 1;
4816 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4817 	if (r)
4818 		return r;
4819 	return 0;
4820 }
4821 
cayman_trig(struct r600_shader_ctx * ctx)4822 static int cayman_trig(struct r600_shader_ctx *ctx)
4823 {
4824 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4825 	struct r600_bytecode_alu alu;
4826 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
4827 	int i, r;
4828 
4829 	r = tgsi_setup_trig(ctx);
4830 	if (r)
4831 		return r;
4832 
4833 
4834 	for (i = 0; i < last_slot; i++) {
4835 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4836 		alu.op = ctx->inst_info->op;
4837 		alu.dst.chan = i;
4838 
4839 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4840 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4841 
4842 		alu.src[0].sel = ctx->temp_reg;
4843 		alu.src[0].chan = 0;
4844 		if (i == last_slot - 1)
4845 			alu.last = 1;
4846 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4847 		if (r)
4848 			return r;
4849 	}
4850 	return 0;
4851 }
4852 
tgsi_trig(struct r600_shader_ctx * ctx)4853 static int tgsi_trig(struct r600_shader_ctx *ctx)
4854 {
4855 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4856 	struct r600_bytecode_alu alu;
4857 	int i, r;
4858 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4859 
4860 	r = tgsi_setup_trig(ctx);
4861 	if (r)
4862 		return r;
4863 
4864 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4865 	alu.op = ctx->inst_info->op;
4866 	alu.dst.chan = 0;
4867 	alu.dst.sel = ctx->temp_reg;
4868 	alu.dst.write = 1;
4869 
4870 	alu.src[0].sel = ctx->temp_reg;
4871 	alu.src[0].chan = 0;
4872 	alu.last = 1;
4873 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4874 	if (r)
4875 		return r;
4876 
4877 	/* replicate result */
4878 	for (i = 0; i < lasti + 1; i++) {
4879 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4880 			continue;
4881 
4882 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4883 		alu.op = ALU_OP1_MOV;
4884 
4885 		alu.src[0].sel = ctx->temp_reg;
4886 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4887 		if (i == lasti)
4888 			alu.last = 1;
4889 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4890 		if (r)
4891 			return r;
4892 	}
4893 	return 0;
4894 }
4895 
tgsi_kill(struct r600_shader_ctx * ctx)4896 static int tgsi_kill(struct r600_shader_ctx *ctx)
4897 {
4898 	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4899 	struct r600_bytecode_alu alu;
4900 	int i, r;
4901 
4902 	for (i = 0; i < 4; i++) {
4903 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4904 		alu.op = ctx->inst_info->op;
4905 
4906 		alu.dst.chan = i;
4907 
4908 		alu.src[0].sel = V_SQ_ALU_SRC_0;
4909 
4910 		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
4911 			alu.src[1].sel = V_SQ_ALU_SRC_1;
4912 			alu.src[1].neg = 1;
4913 		} else {
4914 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4915 		}
4916 		if (i == 3) {
4917 			alu.last = 1;
4918 		}
4919 		r = r600_bytecode_add_alu(ctx->bc, &alu);
4920 		if (r)
4921 			return r;
4922 	}
4923 
4924 	/* kill must be last in ALU */
4925 	ctx->bc->force_add_cf = 1;
4926 	ctx->shader->uses_kill = TRUE;
4927 	return 0;
4928 }
4929 
tgsi_lit(struct r600_shader_ctx * ctx)4930 static int tgsi_lit(struct r600_shader_ctx *ctx)
4931 {
4932 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4933 	struct r600_bytecode_alu alu;
4934 	int r;
4935 
4936 	/* tmp.x = max(src.y, 0.0) */
4937 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4938 	alu.op = ALU_OP2_MAX;
4939 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
4940 	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
4941 	alu.src[1].chan = 1;
4942 
4943 	alu.dst.sel = ctx->temp_reg;
4944 	alu.dst.chan = 0;
4945 	alu.dst.write = 1;
4946 
4947 	alu.last = 1;
4948 	r = r600_bytecode_add_alu(ctx->bc, &alu);
4949 	if (r)
4950 		return r;
4951 
4952 	if (inst->Dst[0].Register.WriteMask & (1 << 2))
4953 	{
4954 		int chan;
4955 		int sel;
4956 		unsigned i;
4957 
4958 		if (ctx->bc->chip_class == CAYMAN) {
4959 			for (i = 0; i < 3; i++) {
4960 				/* tmp.z = log(tmp.x) */
4961 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4962 				alu.op = ALU_OP1_LOG_CLAMPED;
4963 				alu.src[0].sel = ctx->temp_reg;
4964 				alu.src[0].chan = 0;
4965 				alu.dst.sel = ctx->temp_reg;
4966 				alu.dst.chan = i;
4967 				if (i == 2) {
4968 					alu.dst.write = 1;
4969 					alu.last = 1;
4970 				} else
4971 					alu.dst.write = 0;
4972 
4973 				r = r600_bytecode_add_alu(ctx->bc, &alu);
4974 				if (r)
4975 					return r;
4976 			}
4977 		} else {
4978 			/* tmp.z = log(tmp.x) */
4979 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4980 			alu.op = ALU_OP1_LOG_CLAMPED;
4981 			alu.src[0].sel = ctx->temp_reg;
4982 			alu.src[0].chan = 0;
4983 			alu.dst.sel = ctx->temp_reg;
4984 			alu.dst.chan = 2;
4985 			alu.dst.write = 1;
4986 			alu.last = 1;
4987 			r = r600_bytecode_add_alu(ctx->bc, &alu);
4988 			if (r)
4989 				return r;
4990 		}
4991 
4992 		chan = alu.dst.chan;
4993 		sel = alu.dst.sel;
4994 
4995 		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
4996 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4997 		alu.op = ALU_OP3_MUL_LIT;
4998 		alu.src[0].sel  = sel;
4999 		alu.src[0].chan = chan;
5000 		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
5001 		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
5002 		alu.dst.sel = ctx->temp_reg;
5003 		alu.dst.chan = 0;
5004 		alu.dst.write = 1;
5005 		alu.is_op3 = 1;
5006 		alu.last = 1;
5007 		r = r600_bytecode_add_alu(ctx->bc, &alu);
5008 		if (r)
5009 			return r;
5010 
5011 		if (ctx->bc->chip_class == CAYMAN) {
5012 			for (i = 0; i < 3; i++) {
5013 				/* dst.z = exp(tmp.x) */
5014 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5015 				alu.op = ALU_OP1_EXP_IEEE;
5016 				alu.src[0].sel = ctx->temp_reg;
5017 				alu.src[0].chan = 0;
5018 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5019 				if (i == 2) {
5020 					alu.dst.write = 1;
5021 					alu.last = 1;
5022 				} else
5023 					alu.dst.write = 0;
5024 				r = r600_bytecode_add_alu(ctx->bc, &alu);
5025 				if (r)
5026 					return r;
5027 			}
5028 		} else {
5029 			/* dst.z = exp(tmp.x) */
5030 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5031 			alu.op = ALU_OP1_EXP_IEEE;
5032 			alu.src[0].sel = ctx->temp_reg;
5033 			alu.src[0].chan = 0;
5034 			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
5035 			alu.last = 1;
5036 			r = r600_bytecode_add_alu(ctx->bc, &alu);
5037 			if (r)
5038 				return r;
5039 		}
5040 	}
5041 
5042 	/* dst.x, <- 1.0  */
5043 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5044 	alu.op = ALU_OP1_MOV;
5045 	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
5046 	alu.src[0].chan = 0;
5047 	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
5048 	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
5049 	r = r600_bytecode_add_alu(ctx->bc, &alu);
5050 	if (r)
5051 		return r;
5052 
5053 	/* dst.y = max(src.x, 0.0) */
5054 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5055 	alu.op = ALU_OP2_MAX;
5056 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5057 	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
5058 	alu.src[1].chan = 0;
5059 	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
5060 	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
5061 	r = r600_bytecode_add_alu(ctx->bc, &alu);
5062 	if (r)
5063 		return r;
5064 
5065 	/* dst.w, <- 1.0  */
5066 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5067 	alu.op = ALU_OP1_MOV;
5068 	alu.src[0].sel  = V_SQ_ALU_SRC_1;
5069 	alu.src[0].chan = 0;
5070 	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
5071 	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
5072 	alu.last = 1;
5073 	r = r600_bytecode_add_alu(ctx->bc, &alu);
5074 	if (r)
5075 		return r;
5076 
5077 	return 0;
5078 }
5079 
tgsi_rsq(struct r600_shader_ctx * ctx)5080 static int tgsi_rsq(struct r600_shader_ctx *ctx)
5081 {
5082 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5083 	struct r600_bytecode_alu alu;
5084 	int i, r;
5085 
5086 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5087 
5088 	alu.op = ALU_OP1_RECIPSQRT_IEEE;
5089 
5090 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5091 		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5092 		r600_bytecode_src_set_abs(&alu.src[i]);
5093 	}
5094 	alu.dst.sel = ctx->temp_reg;
5095 	alu.dst.write = 1;
5096 	alu.last = 1;
5097 	r = r600_bytecode_add_alu(ctx->bc, &alu);
5098 	if (r)
5099 		return r;
5100 	/* replicate result */
5101 	return tgsi_helper_tempx_replicate(ctx);
5102 }
5103 
tgsi_helper_tempx_replicate(struct r600_shader_ctx * ctx)5104 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
5105 {
5106 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5107 	struct r600_bytecode_alu alu;
5108 	int i, r;
5109 
5110 	for (i = 0; i < 4; i++) {
5111 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5112 		alu.src[0].sel = ctx->temp_reg;
5113 		alu.op = ALU_OP1_MOV;
5114 		alu.dst.chan = i;
5115 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5116 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5117 		if (i == 3)
5118 			alu.last = 1;
5119 		r = r600_bytecode_add_alu(ctx->bc, &alu);
5120 		if (r)
5121 			return r;
5122 	}
5123 	return 0;
5124 }
5125 
tgsi_trans_srcx_replicate(struct r600_shader_ctx * ctx)5126 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
5127 {
5128 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5129 	struct r600_bytecode_alu alu;
5130 	int i, r;
5131 
5132 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5133 	alu.op = ctx->inst_info->op;
5134 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5135 		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5136 	}
5137 	alu.dst.sel = ctx->temp_reg;
5138 	alu.dst.write = 1;
5139 	alu.last = 1;
5140 	r = r600_bytecode_add_alu(ctx->bc, &alu);
5141 	if (r)
5142 		return r;
5143 	/* replicate result */
5144 	return tgsi_helper_tempx_replicate(ctx);
5145 }
5146 
cayman_pow(struct r600_shader_ctx * ctx)5147 static int cayman_pow(struct r600_shader_ctx *ctx)
5148 {
5149 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5150 	int i, r;
5151 	struct r600_bytecode_alu alu;
5152 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5153 
5154 	for (i = 0; i < 3; i++) {
5155 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5156 		alu.op = ALU_OP1_LOG_IEEE;
5157 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5158 		alu.dst.sel = ctx->temp_reg;
5159 		alu.dst.chan = i;
5160 		alu.dst.write = 1;
5161 		if (i == 2)
5162 			alu.last = 1;
5163 		r = r600_bytecode_add_alu(ctx->bc, &alu);
5164 		if (r)
5165 			return r;
5166 	}
5167 
5168 	/* b * LOG2(a) */
5169 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5170 	alu.op = ALU_OP2_MUL;
5171 	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5172 	alu.src[1].sel = ctx->temp_reg;
5173 	alu.dst.sel = ctx->temp_reg;
5174 	alu.dst.write = 1;
5175 	alu.last = 1;
5176 	r = r600_bytecode_add_alu(ctx->bc, &alu);
5177 	if (r)
5178 		return r;
5179 
5180 	for (i = 0; i < last_slot; i++) {
5181 		/* POW(a,b) = EXP2(b * LOG2(a))*/
5182 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5183 		alu.op = ALU_OP1_EXP_IEEE;
5184 		alu.src[0].sel = ctx->temp_reg;
5185 
5186 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5187 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5188 		if (i == last_slot - 1)
5189 			alu.last = 1;
5190 		r = r600_bytecode_add_alu(ctx->bc, &alu);
5191 		if (r)
5192 			return r;
5193 	}
5194 	return 0;
5195 }
5196 
tgsi_pow(struct r600_shader_ctx * ctx)5197 static int tgsi_pow(struct r600_shader_ctx *ctx)
5198 {
5199 	struct r600_bytecode_alu alu;
5200 	int r;
5201 
5202 	/* LOG2(a) */
5203 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5204 	alu.op = ALU_OP1_LOG_IEEE;
5205 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5206 	alu.dst.sel = ctx->temp_reg;
5207 	alu.dst.write = 1;
5208 	alu.last = 1;
5209 	r = r600_bytecode_add_alu(ctx->bc, &alu);
5210 	if (r)
5211 		return r;
5212 	/* b * LOG2(a) */
5213 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5214 	alu.op = ALU_OP2_MUL;
5215 	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5216 	alu.src[1].sel = ctx->temp_reg;
5217 	alu.dst.sel = ctx->temp_reg;
5218 	alu.dst.write = 1;
5219 	alu.last = 1;
5220 	r = r600_bytecode_add_alu(ctx->bc, &alu);
5221 	if (r)
5222 		return r;
5223 	/* POW(a,b) = EXP2(b * LOG2(a))*/
5224 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5225 	alu.op = ALU_OP1_EXP_IEEE;
5226 	alu.src[0].sel = ctx->temp_reg;
5227 	alu.dst.sel = ctx->temp_reg;
5228 	alu.dst.write = 1;
5229 	alu.last = 1;
5230 	r = r600_bytecode_add_alu(ctx->bc, &alu);
5231 	if (r)
5232 		return r;
5233 	return tgsi_helper_tempx_replicate(ctx);
5234 }
5235 
tgsi_divmod(struct r600_shader_ctx * ctx,int mod,int signed_op)5236 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
5237 {
5238 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5239 	struct r600_bytecode_alu alu;
5240 	int i, r, j;
5241 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5242 	int tmp0 = ctx->temp_reg;
5243 	int tmp1 = r600_get_temp(ctx);
5244 	int tmp2 = r600_get_temp(ctx);
5245 	int tmp3 = r600_get_temp(ctx);
5246 	/* Unsigned path:
5247 	 *
5248 	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
5249 	 *
5250 	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
5251 	 * 2. tmp0.z = lo (tmp0.x * src2)
5252 	 * 3. tmp0.w = -tmp0.z
5253 	 * 4. tmp0.y = hi (tmp0.x * src2)
5254 	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
5255 	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
5256 	 * 7. tmp1.x = tmp0.x - tmp0.w
5257 	 * 8. tmp1.y = tmp0.x + tmp0.w
5258 	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
5259 	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
5260 	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
5261 	 *
5262 	 * 12. tmp0.w = src1 - tmp0.y       = r
5263 	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
5264 	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
5265 	 *
5266 	 * if DIV
5267 	 *
5268 	 *   15. tmp1.z = tmp0.z + 1			= q + 1
5269 	 *   16. tmp1.w = tmp0.z - 1			= q - 1
5270 	 *
5271 	 * else MOD
5272 	 *
5273 	 *   15. tmp1.z = tmp0.w - src2			= r - src2
5274 	 *   16. tmp1.w = tmp0.w + src2			= r + src2
5275 	 *
5276 	 * endif
5277 	 *
5278 	 * 17. tmp1.x = tmp1.x & tmp1.y
5279 	 *
5280 	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
5281 	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
5282 	 *
5283 	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
5284 	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
5285 	 *
5286 	 * Signed path:
5287 	 *
5288 	 * Same as unsigned, using abs values of the operands,
5289 	 * and fixing the sign of the result in the end.
5290 	 */
5291 
5292 	for (i = 0; i < 4; i++) {
5293 		if (!(write_mask & (1<<i)))
5294 			continue;
5295 
5296 		if (signed_op) {
5297 
5298 			/* tmp2.x = -src0 */
5299 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5300 			alu.op = ALU_OP2_SUB_INT;
5301 
5302 			alu.dst.sel = tmp2;
5303 			alu.dst.chan = 0;
5304 			alu.dst.write = 1;
5305 
5306 			alu.src[0].sel = V_SQ_ALU_SRC_0;
5307 
5308 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5309 
5310 			alu.last = 1;
5311 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5312 				return r;
5313 
5314 			/* tmp2.y = -src1 */
5315 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5316 			alu.op = ALU_OP2_SUB_INT;
5317 
5318 			alu.dst.sel = tmp2;
5319 			alu.dst.chan = 1;
5320 			alu.dst.write = 1;
5321 
5322 			alu.src[0].sel = V_SQ_ALU_SRC_0;
5323 
5324 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5325 
5326 			alu.last = 1;
5327 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5328 				return r;
5329 
5330 			/* tmp2.z sign bit is set if src0 and src2 signs are different */
5331 			/* it will be a sign of the quotient */
5332 			if (!mod) {
5333 
5334 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5335 				alu.op = ALU_OP2_XOR_INT;
5336 
5337 				alu.dst.sel = tmp2;
5338 				alu.dst.chan = 2;
5339 				alu.dst.write = 1;
5340 
5341 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5342 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5343 
5344 				alu.last = 1;
5345 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5346 					return r;
5347 			}
5348 
5349 			/* tmp2.x = |src0| */
5350 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5351 			alu.op = ALU_OP3_CNDGE_INT;
5352 			alu.is_op3 = 1;
5353 
5354 			alu.dst.sel = tmp2;
5355 			alu.dst.chan = 0;
5356 			alu.dst.write = 1;
5357 
5358 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5359 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5360 			alu.src[2].sel = tmp2;
5361 			alu.src[2].chan = 0;
5362 
5363 			alu.last = 1;
5364 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5365 				return r;
5366 
5367 			/* tmp2.y = |src1| */
5368 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5369 			alu.op = ALU_OP3_CNDGE_INT;
5370 			alu.is_op3 = 1;
5371 
5372 			alu.dst.sel = tmp2;
5373 			alu.dst.chan = 1;
5374 			alu.dst.write = 1;
5375 
5376 			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5377 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5378 			alu.src[2].sel = tmp2;
5379 			alu.src[2].chan = 1;
5380 
5381 			alu.last = 1;
5382 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5383 				return r;
5384 
5385 		}
5386 
5387 		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
5388 		if (ctx->bc->chip_class == CAYMAN) {
5389 			/* tmp3.x = u2f(src2) */
5390 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5391 			alu.op = ALU_OP1_UINT_TO_FLT;
5392 
5393 			alu.dst.sel = tmp3;
5394 			alu.dst.chan = 0;
5395 			alu.dst.write = 1;
5396 
5397 			if (signed_op) {
5398 				alu.src[0].sel = tmp2;
5399 				alu.src[0].chan = 1;
5400 			} else {
5401 				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5402 			}
5403 
5404 			alu.last = 1;
5405 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5406 				return r;
5407 
5408 			/* tmp0.x = recip(tmp3.x) */
5409 			for (j = 0 ; j < 3; j++) {
5410 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5411 				alu.op = ALU_OP1_RECIP_IEEE;
5412 
5413 				alu.dst.sel = tmp0;
5414 				alu.dst.chan = j;
5415 				alu.dst.write = (j == 0);
5416 
5417 				alu.src[0].sel = tmp3;
5418 				alu.src[0].chan = 0;
5419 
5420 				if (j == 2)
5421 					alu.last = 1;
5422 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5423 					return r;
5424 			}
5425 
5426 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5427 			alu.op = ALU_OP2_MUL;
5428 
5429 			alu.src[0].sel = tmp0;
5430 			alu.src[0].chan = 0;
5431 
5432 			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5433 			alu.src[1].value = 0x4f800000;
5434 
5435 			alu.dst.sel = tmp3;
5436 			alu.dst.write = 1;
5437 			alu.last = 1;
5438 			r = r600_bytecode_add_alu(ctx->bc, &alu);
5439 			if (r)
5440 				return r;
5441 
5442 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5443 			alu.op = ALU_OP1_FLT_TO_UINT;
5444 
5445 			alu.dst.sel = tmp0;
5446 			alu.dst.chan = 0;
5447 			alu.dst.write = 1;
5448 
5449 			alu.src[0].sel = tmp3;
5450 			alu.src[0].chan = 0;
5451 
5452 			alu.last = 1;
5453 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5454 				return r;
5455 
5456 		} else {
5457 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5458 			alu.op = ALU_OP1_RECIP_UINT;
5459 
5460 			alu.dst.sel = tmp0;
5461 			alu.dst.chan = 0;
5462 			alu.dst.write = 1;
5463 
5464 			if (signed_op) {
5465 				alu.src[0].sel = tmp2;
5466 				alu.src[0].chan = 1;
5467 			} else {
5468 				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5469 			}
5470 
5471 			alu.last = 1;
5472 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5473 				return r;
5474 		}
5475 
5476 		/* 2. tmp0.z = lo (tmp0.x * src2) */
5477 		if (ctx->bc->chip_class == CAYMAN) {
5478 			for (j = 0 ; j < 4; j++) {
5479 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5480 				alu.op = ALU_OP2_MULLO_UINT;
5481 
5482 				alu.dst.sel = tmp0;
5483 				alu.dst.chan = j;
5484 				alu.dst.write = (j == 2);
5485 
5486 				alu.src[0].sel = tmp0;
5487 				alu.src[0].chan = 0;
5488 				if (signed_op) {
5489 					alu.src[1].sel = tmp2;
5490 					alu.src[1].chan = 1;
5491 				} else {
5492 					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5493 				}
5494 
5495 				alu.last = (j == 3);
5496 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5497 					return r;
5498 			}
5499 		} else {
5500 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5501 			alu.op = ALU_OP2_MULLO_UINT;
5502 
5503 			alu.dst.sel = tmp0;
5504 			alu.dst.chan = 2;
5505 			alu.dst.write = 1;
5506 
5507 			alu.src[0].sel = tmp0;
5508 			alu.src[0].chan = 0;
5509 			if (signed_op) {
5510 				alu.src[1].sel = tmp2;
5511 				alu.src[1].chan = 1;
5512 			} else {
5513 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5514 			}
5515 
5516 			alu.last = 1;
5517 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5518 				return r;
5519 		}
5520 
5521 		/* 3. tmp0.w = -tmp0.z */
5522 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5523 		alu.op = ALU_OP2_SUB_INT;
5524 
5525 		alu.dst.sel = tmp0;
5526 		alu.dst.chan = 3;
5527 		alu.dst.write = 1;
5528 
5529 		alu.src[0].sel = V_SQ_ALU_SRC_0;
5530 		alu.src[1].sel = tmp0;
5531 		alu.src[1].chan = 2;
5532 
5533 		alu.last = 1;
5534 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5535 			return r;
5536 
5537 		/* 4. tmp0.y = hi (tmp0.x * src2) */
5538 		if (ctx->bc->chip_class == CAYMAN) {
5539 			for (j = 0 ; j < 4; j++) {
5540 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5541 				alu.op = ALU_OP2_MULHI_UINT;
5542 
5543 				alu.dst.sel = tmp0;
5544 				alu.dst.chan = j;
5545 				alu.dst.write = (j == 1);
5546 
5547 				alu.src[0].sel = tmp0;
5548 				alu.src[0].chan = 0;
5549 
5550 				if (signed_op) {
5551 					alu.src[1].sel = tmp2;
5552 					alu.src[1].chan = 1;
5553 				} else {
5554 					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5555 				}
5556 				alu.last = (j == 3);
5557 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5558 					return r;
5559 			}
5560 		} else {
5561 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5562 			alu.op = ALU_OP2_MULHI_UINT;
5563 
5564 			alu.dst.sel = tmp0;
5565 			alu.dst.chan = 1;
5566 			alu.dst.write = 1;
5567 
5568 			alu.src[0].sel = tmp0;
5569 			alu.src[0].chan = 0;
5570 
5571 			if (signed_op) {
5572 				alu.src[1].sel = tmp2;
5573 				alu.src[1].chan = 1;
5574 			} else {
5575 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5576 			}
5577 
5578 			alu.last = 1;
5579 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5580 				return r;
5581 		}
5582 
5583 		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
5584 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5585 		alu.op = ALU_OP3_CNDE_INT;
5586 		alu.is_op3 = 1;
5587 
5588 		alu.dst.sel = tmp0;
5589 		alu.dst.chan = 2;
5590 		alu.dst.write = 1;
5591 
5592 		alu.src[0].sel = tmp0;
5593 		alu.src[0].chan = 1;
5594 		alu.src[1].sel = tmp0;
5595 		alu.src[1].chan = 3;
5596 		alu.src[2].sel = tmp0;
5597 		alu.src[2].chan = 2;
5598 
5599 		alu.last = 1;
5600 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5601 			return r;
5602 
5603 		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
5604 		if (ctx->bc->chip_class == CAYMAN) {
5605 			for (j = 0 ; j < 4; j++) {
5606 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5607 				alu.op = ALU_OP2_MULHI_UINT;
5608 
5609 				alu.dst.sel = tmp0;
5610 				alu.dst.chan = j;
5611 				alu.dst.write = (j == 3);
5612 
5613 				alu.src[0].sel = tmp0;
5614 				alu.src[0].chan = 2;
5615 
5616 				alu.src[1].sel = tmp0;
5617 				alu.src[1].chan = 0;
5618 
5619 				alu.last = (j == 3);
5620 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5621 					return r;
5622 			}
5623 		} else {
5624 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5625 			alu.op = ALU_OP2_MULHI_UINT;
5626 
5627 			alu.dst.sel = tmp0;
5628 			alu.dst.chan = 3;
5629 			alu.dst.write = 1;
5630 
5631 			alu.src[0].sel = tmp0;
5632 			alu.src[0].chan = 2;
5633 
5634 			alu.src[1].sel = tmp0;
5635 			alu.src[1].chan = 0;
5636 
5637 			alu.last = 1;
5638 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5639 				return r;
5640 		}
5641 
5642 		/* 7. tmp1.x = tmp0.x - tmp0.w */
5643 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5644 		alu.op = ALU_OP2_SUB_INT;
5645 
5646 		alu.dst.sel = tmp1;
5647 		alu.dst.chan = 0;
5648 		alu.dst.write = 1;
5649 
5650 		alu.src[0].sel = tmp0;
5651 		alu.src[0].chan = 0;
5652 		alu.src[1].sel = tmp0;
5653 		alu.src[1].chan = 3;
5654 
5655 		alu.last = 1;
5656 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5657 			return r;
5658 
5659 		/* 8. tmp1.y = tmp0.x + tmp0.w */
5660 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5661 		alu.op = ALU_OP2_ADD_INT;
5662 
5663 		alu.dst.sel = tmp1;
5664 		alu.dst.chan = 1;
5665 		alu.dst.write = 1;
5666 
5667 		alu.src[0].sel = tmp0;
5668 		alu.src[0].chan = 0;
5669 		alu.src[1].sel = tmp0;
5670 		alu.src[1].chan = 3;
5671 
5672 		alu.last = 1;
5673 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5674 			return r;
5675 
5676 		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
5677 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5678 		alu.op = ALU_OP3_CNDE_INT;
5679 		alu.is_op3 = 1;
5680 
5681 		alu.dst.sel = tmp0;
5682 		alu.dst.chan = 0;
5683 		alu.dst.write = 1;
5684 
5685 		alu.src[0].sel = tmp0;
5686 		alu.src[0].chan = 1;
5687 		alu.src[1].sel = tmp1;
5688 		alu.src[1].chan = 1;
5689 		alu.src[2].sel = tmp1;
5690 		alu.src[2].chan = 0;
5691 
5692 		alu.last = 1;
5693 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5694 			return r;
5695 
5696 		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
5697 		if (ctx->bc->chip_class == CAYMAN) {
5698 			for (j = 0 ; j < 4; j++) {
5699 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5700 				alu.op = ALU_OP2_MULHI_UINT;
5701 
5702 				alu.dst.sel = tmp0;
5703 				alu.dst.chan = j;
5704 				alu.dst.write = (j == 2);
5705 
5706 				alu.src[0].sel = tmp0;
5707 				alu.src[0].chan = 0;
5708 
5709 				if (signed_op) {
5710 					alu.src[1].sel = tmp2;
5711 					alu.src[1].chan = 0;
5712 				} else {
5713 					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5714 				}
5715 
5716 				alu.last = (j == 3);
5717 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5718 					return r;
5719 			}
5720 		} else {
5721 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5722 			alu.op = ALU_OP2_MULHI_UINT;
5723 
5724 			alu.dst.sel = tmp0;
5725 			alu.dst.chan = 2;
5726 			alu.dst.write = 1;
5727 
5728 			alu.src[0].sel = tmp0;
5729 			alu.src[0].chan = 0;
5730 
5731 			if (signed_op) {
5732 				alu.src[1].sel = tmp2;
5733 				alu.src[1].chan = 0;
5734 			} else {
5735 				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5736 			}
5737 
5738 			alu.last = 1;
5739 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5740 				return r;
5741 		}
5742 
5743 		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
5744 		if (ctx->bc->chip_class == CAYMAN) {
5745 			for (j = 0 ; j < 4; j++) {
5746 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5747 				alu.op = ALU_OP2_MULLO_UINT;
5748 
5749 				alu.dst.sel = tmp0;
5750 				alu.dst.chan = j;
5751 				alu.dst.write = (j == 1);
5752 
5753 				if (signed_op) {
5754 					alu.src[0].sel = tmp2;
5755 					alu.src[0].chan = 1;
5756 				} else {
5757 					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5758 				}
5759 
5760 				alu.src[1].sel = tmp0;
5761 				alu.src[1].chan = 2;
5762 
5763 				alu.last = (j == 3);
5764 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5765 					return r;
5766 			}
5767 		} else {
5768 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5769 			alu.op = ALU_OP2_MULLO_UINT;
5770 
5771 			alu.dst.sel = tmp0;
5772 			alu.dst.chan = 1;
5773 			alu.dst.write = 1;
5774 
5775 			if (signed_op) {
5776 				alu.src[0].sel = tmp2;
5777 				alu.src[0].chan = 1;
5778 			} else {
5779 				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5780 			}
5781 
5782 			alu.src[1].sel = tmp0;
5783 			alu.src[1].chan = 2;
5784 
5785 			alu.last = 1;
5786 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5787 				return r;
5788 		}
5789 
5790 		/* 12. tmp0.w = src1 - tmp0.y       = r */
5791 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5792 		alu.op = ALU_OP2_SUB_INT;
5793 
5794 		alu.dst.sel = tmp0;
5795 		alu.dst.chan = 3;
5796 		alu.dst.write = 1;
5797 
5798 		if (signed_op) {
5799 			alu.src[0].sel = tmp2;
5800 			alu.src[0].chan = 0;
5801 		} else {
5802 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5803 		}
5804 
5805 		alu.src[1].sel = tmp0;
5806 		alu.src[1].chan = 1;
5807 
5808 		alu.last = 1;
5809 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5810 			return r;
5811 
5812 		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
5813 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5814 		alu.op = ALU_OP2_SETGE_UINT;
5815 
5816 		alu.dst.sel = tmp1;
5817 		alu.dst.chan = 0;
5818 		alu.dst.write = 1;
5819 
5820 		alu.src[0].sel = tmp0;
5821 		alu.src[0].chan = 3;
5822 		if (signed_op) {
5823 			alu.src[1].sel = tmp2;
5824 			alu.src[1].chan = 1;
5825 		} else {
5826 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5827 		}
5828 
5829 		alu.last = 1;
5830 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5831 			return r;
5832 
5833 		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
5834 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5835 		alu.op = ALU_OP2_SETGE_UINT;
5836 
5837 		alu.dst.sel = tmp1;
5838 		alu.dst.chan = 1;
5839 		alu.dst.write = 1;
5840 
5841 		if (signed_op) {
5842 			alu.src[0].sel = tmp2;
5843 			alu.src[0].chan = 0;
5844 		} else {
5845 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5846 		}
5847 
5848 		alu.src[1].sel = tmp0;
5849 		alu.src[1].chan = 1;
5850 
5851 		alu.last = 1;
5852 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5853 			return r;
5854 
5855 		if (mod) { /* UMOD */
5856 
5857 			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
5858 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5859 			alu.op = ALU_OP2_SUB_INT;
5860 
5861 			alu.dst.sel = tmp1;
5862 			alu.dst.chan = 2;
5863 			alu.dst.write = 1;
5864 
5865 			alu.src[0].sel = tmp0;
5866 			alu.src[0].chan = 3;
5867 
5868 			if (signed_op) {
5869 				alu.src[1].sel = tmp2;
5870 				alu.src[1].chan = 1;
5871 			} else {
5872 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5873 			}
5874 
5875 			alu.last = 1;
5876 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5877 				return r;
5878 
5879 			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
5880 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5881 			alu.op = ALU_OP2_ADD_INT;
5882 
5883 			alu.dst.sel = tmp1;
5884 			alu.dst.chan = 3;
5885 			alu.dst.write = 1;
5886 
5887 			alu.src[0].sel = tmp0;
5888 			alu.src[0].chan = 3;
5889 			if (signed_op) {
5890 				alu.src[1].sel = tmp2;
5891 				alu.src[1].chan = 1;
5892 			} else {
5893 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5894 			}
5895 
5896 			alu.last = 1;
5897 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5898 				return r;
5899 
5900 		} else { /* UDIV */
5901 
5902 			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
5903 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5904 			alu.op = ALU_OP2_ADD_INT;
5905 
5906 			alu.dst.sel = tmp1;
5907 			alu.dst.chan = 2;
5908 			alu.dst.write = 1;
5909 
5910 			alu.src[0].sel = tmp0;
5911 			alu.src[0].chan = 2;
5912 			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
5913 
5914 			alu.last = 1;
5915 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5916 				return r;
5917 
5918 			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
5919 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5920 			alu.op = ALU_OP2_ADD_INT;
5921 
5922 			alu.dst.sel = tmp1;
5923 			alu.dst.chan = 3;
5924 			alu.dst.write = 1;
5925 
5926 			alu.src[0].sel = tmp0;
5927 			alu.src[0].chan = 2;
5928 			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
5929 
5930 			alu.last = 1;
5931 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5932 				return r;
5933 
5934 		}
5935 
5936 		/* 17. tmp1.x = tmp1.x & tmp1.y */
5937 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5938 		alu.op = ALU_OP2_AND_INT;
5939 
5940 		alu.dst.sel = tmp1;
5941 		alu.dst.chan = 0;
5942 		alu.dst.write = 1;
5943 
5944 		alu.src[0].sel = tmp1;
5945 		alu.src[0].chan = 0;
5946 		alu.src[1].sel = tmp1;
5947 		alu.src[1].chan = 1;
5948 
5949 		alu.last = 1;
5950 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5951 			return r;
5952 
5953 		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
5954 		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
5955 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5956 		alu.op = ALU_OP3_CNDE_INT;
5957 		alu.is_op3 = 1;
5958 
5959 		alu.dst.sel = tmp0;
5960 		alu.dst.chan = 2;
5961 		alu.dst.write = 1;
5962 
5963 		alu.src[0].sel = tmp1;
5964 		alu.src[0].chan = 0;
5965 		alu.src[1].sel = tmp0;
5966 		alu.src[1].chan = mod ? 3 : 2;
5967 		alu.src[2].sel = tmp1;
5968 		alu.src[2].chan = 2;
5969 
5970 		alu.last = 1;
5971 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5972 			return r;
5973 
5974 		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
5975 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5976 		alu.op = ALU_OP3_CNDE_INT;
5977 		alu.is_op3 = 1;
5978 
5979 		if (signed_op) {
5980 			alu.dst.sel = tmp0;
5981 			alu.dst.chan = 2;
5982 			alu.dst.write = 1;
5983 		} else {
5984 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5985 		}
5986 
5987 		alu.src[0].sel = tmp1;
5988 		alu.src[0].chan = 1;
5989 		alu.src[1].sel = tmp1;
5990 		alu.src[1].chan = 3;
5991 		alu.src[2].sel = tmp0;
5992 		alu.src[2].chan = 2;
5993 
5994 		alu.last = 1;
5995 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5996 			return r;
5997 
5998 		if (signed_op) {
5999 
6000 			/* fix the sign of the result */
6001 
6002 			if (mod) {
6003 
6004 				/* tmp0.x = -tmp0.z */
6005 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6006 				alu.op = ALU_OP2_SUB_INT;
6007 
6008 				alu.dst.sel = tmp0;
6009 				alu.dst.chan = 0;
6010 				alu.dst.write = 1;
6011 
6012 				alu.src[0].sel = V_SQ_ALU_SRC_0;
6013 				alu.src[1].sel = tmp0;
6014 				alu.src[1].chan = 2;
6015 
6016 				alu.last = 1;
6017 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6018 					return r;
6019 
6020 				/* sign of the remainder is the same as the sign of src0 */
6021 				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
6022 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6023 				alu.op = ALU_OP3_CNDGE_INT;
6024 				alu.is_op3 = 1;
6025 
6026 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6027 
6028 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6029 				alu.src[1].sel = tmp0;
6030 				alu.src[1].chan = 2;
6031 				alu.src[2].sel = tmp0;
6032 				alu.src[2].chan = 0;
6033 
6034 				alu.last = 1;
6035 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6036 					return r;
6037 
6038 			} else {
6039 
6040 				/* tmp0.x = -tmp0.z */
6041 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6042 				alu.op = ALU_OP2_SUB_INT;
6043 
6044 				alu.dst.sel = tmp0;
6045 				alu.dst.chan = 0;
6046 				alu.dst.write = 1;
6047 
6048 				alu.src[0].sel = V_SQ_ALU_SRC_0;
6049 				alu.src[1].sel = tmp0;
6050 				alu.src[1].chan = 2;
6051 
6052 				alu.last = 1;
6053 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6054 					return r;
6055 
6056 				/* fix the quotient sign (same as the sign of src0*src1) */
6057 				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
6058 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6059 				alu.op = ALU_OP3_CNDGE_INT;
6060 				alu.is_op3 = 1;
6061 
6062 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6063 
6064 				alu.src[0].sel = tmp2;
6065 				alu.src[0].chan = 2;
6066 				alu.src[1].sel = tmp0;
6067 				alu.src[1].chan = 2;
6068 				alu.src[2].sel = tmp0;
6069 				alu.src[2].chan = 0;
6070 
6071 				alu.last = 1;
6072 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6073 					return r;
6074 			}
6075 		}
6076 	}
6077 	return 0;
6078 }
6079 
/* UDIV: unsigned integer division, emitted through the shared tgsi_divmod(). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;	/* quotient, not remainder */
	const int is_signed = 0;	/* operands are unsigned */

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
6084 
/* UMOD: unsigned integer remainder, emitted through the shared tgsi_divmod(). */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;	/* remainder, not quotient */
	const int is_signed = 0;	/* operands are unsigned */

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
6089 
/* IDIV: signed integer division, emitted through the shared tgsi_divmod(). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;	/* quotient, not remainder */
	const int is_signed = 1;	/* operands are signed */

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
6094 
/* IMOD: signed integer remainder, emitted through the shared tgsi_divmod(). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;	/* remainder, not quotient */
	const int is_signed = 1;	/* operands are signed */

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
6099 
6100 
tgsi_f2i(struct r600_shader_ctx * ctx)6101 static int tgsi_f2i(struct r600_shader_ctx *ctx)
6102 {
6103 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6104 	struct r600_bytecode_alu alu;
6105 	int i, r;
6106 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6107 	int last_inst = tgsi_last_instruction(write_mask);
6108 
6109 	for (i = 0; i < 4; i++) {
6110 		if (!(write_mask & (1<<i)))
6111 			continue;
6112 
6113 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6114 		alu.op = ALU_OP1_TRUNC;
6115 
6116 		alu.dst.sel = ctx->temp_reg;
6117 		alu.dst.chan = i;
6118 		alu.dst.write = 1;
6119 
6120 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6121 		if (i == last_inst)
6122 			alu.last = 1;
6123 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6124 		if (r)
6125 			return r;
6126 	}
6127 
6128 	for (i = 0; i < 4; i++) {
6129 		if (!(write_mask & (1<<i)))
6130 			continue;
6131 
6132 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6133 		alu.op = ctx->inst_info->op;
6134 
6135 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6136 
6137 		alu.src[0].sel = ctx->temp_reg;
6138 		alu.src[0].chan = i;
6139 
6140 		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
6141 			alu.last = 1;
6142 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6143 		if (r)
6144 			return r;
6145 	}
6146 
6147 	return 0;
6148 }
6149 
tgsi_iabs(struct r600_shader_ctx * ctx)6150 static int tgsi_iabs(struct r600_shader_ctx *ctx)
6151 {
6152 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6153 	struct r600_bytecode_alu alu;
6154 	int i, r;
6155 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6156 	int last_inst = tgsi_last_instruction(write_mask);
6157 
6158 	/* tmp = -src */
6159 	for (i = 0; i < 4; i++) {
6160 		if (!(write_mask & (1<<i)))
6161 			continue;
6162 
6163 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6164 		alu.op = ALU_OP2_SUB_INT;
6165 
6166 		alu.dst.sel = ctx->temp_reg;
6167 		alu.dst.chan = i;
6168 		alu.dst.write = 1;
6169 
6170 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6171 		alu.src[0].sel = V_SQ_ALU_SRC_0;
6172 
6173 		if (i == last_inst)
6174 			alu.last = 1;
6175 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6176 		if (r)
6177 			return r;
6178 	}
6179 
6180 	/* dst = (src >= 0 ? src : tmp) */
6181 	for (i = 0; i < 4; i++) {
6182 		if (!(write_mask & (1<<i)))
6183 			continue;
6184 
6185 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6186 		alu.op = ALU_OP3_CNDGE_INT;
6187 		alu.is_op3 = 1;
6188 		alu.dst.write = 1;
6189 
6190 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6191 
6192 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6193 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6194 		alu.src[2].sel = ctx->temp_reg;
6195 		alu.src[2].chan = i;
6196 
6197 		if (i == last_inst)
6198 			alu.last = 1;
6199 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6200 		if (r)
6201 			return r;
6202 	}
6203 	return 0;
6204 }
6205 
tgsi_issg(struct r600_shader_ctx * ctx)6206 static int tgsi_issg(struct r600_shader_ctx *ctx)
6207 {
6208 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6209 	struct r600_bytecode_alu alu;
6210 	int i, r;
6211 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6212 	int last_inst = tgsi_last_instruction(write_mask);
6213 
6214 	/* tmp = (src >= 0 ? src : -1) */
6215 	for (i = 0; i < 4; i++) {
6216 		if (!(write_mask & (1<<i)))
6217 			continue;
6218 
6219 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6220 		alu.op = ALU_OP3_CNDGE_INT;
6221 		alu.is_op3 = 1;
6222 
6223 		alu.dst.sel = ctx->temp_reg;
6224 		alu.dst.chan = i;
6225 		alu.dst.write = 1;
6226 
6227 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6228 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6229 		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6230 
6231 		if (i == last_inst)
6232 			alu.last = 1;
6233 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6234 		if (r)
6235 			return r;
6236 	}
6237 
6238 	/* dst = (tmp > 0 ? 1 : tmp) */
6239 	for (i = 0; i < 4; i++) {
6240 		if (!(write_mask & (1<<i)))
6241 			continue;
6242 
6243 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6244 		alu.op = ALU_OP3_CNDGT_INT;
6245 		alu.is_op3 = 1;
6246 		alu.dst.write = 1;
6247 
6248 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6249 
6250 		alu.src[0].sel = ctx->temp_reg;
6251 		alu.src[0].chan = i;
6252 
6253 		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6254 
6255 		alu.src[2].sel = ctx->temp_reg;
6256 		alu.src[2].chan = i;
6257 
6258 		if (i == last_inst)
6259 			alu.last = 1;
6260 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6261 		if (r)
6262 			return r;
6263 	}
6264 	return 0;
6265 }
6266 
6267 
6268 
tgsi_ssg(struct r600_shader_ctx * ctx)6269 static int tgsi_ssg(struct r600_shader_ctx *ctx)
6270 {
6271 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6272 	struct r600_bytecode_alu alu;
6273 	int i, r;
6274 
6275 	/* tmp = (src > 0 ? 1 : src) */
6276 	for (i = 0; i < 4; i++) {
6277 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6278 		alu.op = ALU_OP3_CNDGT;
6279 		alu.is_op3 = 1;
6280 
6281 		alu.dst.sel = ctx->temp_reg;
6282 		alu.dst.chan = i;
6283 
6284 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6285 		alu.src[1].sel = V_SQ_ALU_SRC_1;
6286 		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6287 
6288 		if (i == 3)
6289 			alu.last = 1;
6290 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6291 		if (r)
6292 			return r;
6293 	}
6294 
6295 	/* dst = (-tmp > 0 ? -1 : tmp) */
6296 	for (i = 0; i < 4; i++) {
6297 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6298 		alu.op = ALU_OP3_CNDGT;
6299 		alu.is_op3 = 1;
6300 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6301 
6302 		alu.src[0].sel = ctx->temp_reg;
6303 		alu.src[0].chan = i;
6304 		alu.src[0].neg = 1;
6305 
6306 		alu.src[1].sel = V_SQ_ALU_SRC_1;
6307 		alu.src[1].neg = 1;
6308 
6309 		alu.src[2].sel = ctx->temp_reg;
6310 		alu.src[2].chan = i;
6311 
6312 		if (i == 3)
6313 			alu.last = 1;
6314 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6315 		if (r)
6316 			return r;
6317 	}
6318 	return 0;
6319 }
6320 
tgsi_bfi(struct r600_shader_ctx * ctx)6321 static int tgsi_bfi(struct r600_shader_ctx *ctx)
6322 {
6323 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6324 	struct r600_bytecode_alu alu;
6325 	int i, r, t1, t2;
6326 
6327 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6328 	int last_inst = tgsi_last_instruction(write_mask);
6329 
6330 	t1 = r600_get_temp(ctx);
6331 
6332 	for (i = 0; i < 4; i++) {
6333 		if (!(write_mask & (1<<i)))
6334 			continue;
6335 
6336 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6337 		alu.op = ALU_OP2_SETGE_INT;
6338 		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6339 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6340 		alu.src[1].value = 32;
6341 		alu.dst.sel = ctx->temp_reg;
6342 		alu.dst.chan = i;
6343 		alu.dst.write = 1;
6344 		alu.last = i == last_inst;
6345 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6346 		if (r)
6347 			return r;
6348 	}
6349 
6350 	for (i = 0; i < 4; i++) {
6351 		if (!(write_mask & (1<<i)))
6352 			continue;
6353 
6354 		/* create mask tmp */
6355 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6356 		alu.op = ALU_OP2_BFM_INT;
6357 		alu.dst.sel = t1;
6358 		alu.dst.chan = i;
6359 		alu.dst.write = 1;
6360 		alu.last = i == last_inst;
6361 
6362 		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6363 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6364 
6365 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6366 		if (r)
6367 			return r;
6368 	}
6369 
6370 	t2 = r600_get_temp(ctx);
6371 
6372 	for (i = 0; i < 4; i++) {
6373 		if (!(write_mask & (1<<i)))
6374 			continue;
6375 
6376 		/* shift insert left */
6377 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6378 		alu.op = ALU_OP2_LSHL_INT;
6379 		alu.dst.sel = t2;
6380 		alu.dst.chan = i;
6381 		alu.dst.write = 1;
6382 		alu.last = i == last_inst;
6383 
6384 		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6385 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6386 
6387 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6388 		if (r)
6389 			return r;
6390 	}
6391 
6392 	for (i = 0; i < 4; i++) {
6393 		if (!(write_mask & (1<<i)))
6394 			continue;
6395 
6396 		/* actual bitfield insert */
6397 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6398 		alu.op = ALU_OP3_BFI_INT;
6399 		alu.is_op3 = 1;
6400 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6401 		alu.dst.chan = i;
6402 		alu.dst.write = 1;
6403 		alu.last = i == last_inst;
6404 
6405 		alu.src[0].sel = t1;
6406 		alu.src[0].chan = i;
6407 		alu.src[1].sel = t2;
6408 		alu.src[1].chan = i;
6409 		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6410 
6411 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6412 		if (r)
6413 			return r;
6414 	}
6415 
6416 	for (i = 0; i < 4; i++) {
6417 		if (!(write_mask & (1<<i)))
6418 			continue;
6419 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6420 		alu.op = ALU_OP3_CNDE_INT;
6421 		alu.is_op3 = 1;
6422 		alu.src[0].sel = ctx->temp_reg;
6423 		alu.src[0].chan = i;
6424 		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6425 
6426 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6427 
6428 		alu.src[1].sel = alu.dst.sel;
6429 		alu.src[1].chan = i;
6430 
6431 		alu.last = i == last_inst;
6432 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6433 		if (r)
6434 			return r;
6435 	}
6436 	return 0;
6437 }
6438 
tgsi_msb(struct r600_shader_ctx * ctx)6439 static int tgsi_msb(struct r600_shader_ctx *ctx)
6440 {
6441 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6442 	struct r600_bytecode_alu alu;
6443 	int i, r, t1, t2;
6444 
6445 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6446 	int last_inst = tgsi_last_instruction(write_mask);
6447 
6448 	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
6449 		ctx->inst_info->op == ALU_OP1_FFBH_UINT);
6450 
6451 	t1 = ctx->temp_reg;
6452 
6453 	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
6454 	for (i = 0; i < 4; i++) {
6455 		if (!(write_mask & (1<<i)))
6456 			continue;
6457 
6458 		/* t1 = FFBH_INT / FFBH_UINT */
6459 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6460 		alu.op = ctx->inst_info->op;
6461 		alu.dst.sel = t1;
6462 		alu.dst.chan = i;
6463 		alu.dst.write = 1;
6464 		alu.last = i == last_inst;
6465 
6466 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6467 
6468 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6469 		if (r)
6470 			return r;
6471 	}
6472 
6473 	t2 = r600_get_temp(ctx);
6474 
6475 	for (i = 0; i < 4; i++) {
6476 		if (!(write_mask & (1<<i)))
6477 			continue;
6478 
6479 		/* t2 = 31 - t1 */
6480 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6481 		alu.op = ALU_OP2_SUB_INT;
6482 		alu.dst.sel = t2;
6483 		alu.dst.chan = i;
6484 		alu.dst.write = 1;
6485 		alu.last = i == last_inst;
6486 
6487 		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
6488 		alu.src[0].value = 31;
6489 		alu.src[1].sel = t1;
6490 		alu.src[1].chan = i;
6491 
6492 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6493 		if (r)
6494 			return r;
6495 	}
6496 
6497 	for (i = 0; i < 4; i++) {
6498 		if (!(write_mask & (1<<i)))
6499 			continue;
6500 
6501 		/* result = t1 >= 0 ? t2 : t1 */
6502 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6503 		alu.op = ALU_OP3_CNDGE_INT;
6504 		alu.is_op3 = 1;
6505 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6506 		alu.dst.chan = i;
6507 		alu.dst.write = 1;
6508 		alu.last = i == last_inst;
6509 
6510 		alu.src[0].sel = t1;
6511 		alu.src[0].chan = i;
6512 		alu.src[1].sel = t2;
6513 		alu.src[1].chan = i;
6514 		alu.src[2].sel = t1;
6515 		alu.src[2].chan = i;
6516 
6517 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6518 		if (r)
6519 			return r;
6520 	}
6521 
6522 	return 0;
6523 }
6524 
/*
 * Emit the bytecode for the TGSI interpolation opcodes handled here
 * (INTERP_OFFSET and INTERP_SAMPLE get the extra offset path below, any
 * other opcode routed to this function uses the centroid interpolator).
 *
 * Steps:
 *   - pick the i/j interpolator GPR and base channel for the input,
 *   - for INTERP_OFFSET / INTERP_SAMPLE, fetch the horizontal and vertical
 *     gradients of i/j and build adjusted i/j values in ctx->temp_reg with
 *     two MULADD passes,
 *   - run INTERP_ZW / INTERP_XY into a fresh temporary,
 *   - MOV the result to the destination, applying the source swizzle.
 *
 * Returns 0 on success, or the error reported by r600_bytecode_add_tex() /
 * r600_bytecode_add_alu().
 */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	/* index into ctx->shader->input[], skipping the system-value inputs */
	const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
	}

	/* a negative index is replaced by interpolator 0 */
	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	/* two i/j pairs share one GPR: even ij_index -> channels 0/1, odd -> 2/3 */
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		/* INTERP_SAMPLE: the offset comes from the register returned by
		 * load_sample_position() (its channels 2 and 3 are used below) */
		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* fetch the gradients of the i/j pair: i == 0 -> H, i == 1 -> V */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp_reg.i = gradientsH.i * offset_x + ij.i   (i = 0, 1) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp_reg.i = gradientsV.i * offset_y + temp_reg.i   (i = 0, 1) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/*
	 * Eight INTERP ops in two groups of four: i = 0..3 is INTERP_ZW,
	 * i = 4..7 is INTERP_XY.  Only i = 2..5 have their destination write
	 * enabled, i.e. tmp.zw from the first group and tmp.xy from the second.
	 */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		/* src0 alternates between the second and first value of the i/j
		 * pair; the adjusted pair lives in temp_reg for the offset opcodes */
		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		/* close the ALU group after every fourth instruction */
		alu.last = i % 4 == 3;
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	/* so copy tmp to the destination with the source swizzle applied,
	 * for the channels enabled in the write mask only */
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
6686 
6687 
tgsi_helper_copy(struct r600_shader_ctx * ctx,struct tgsi_full_instruction * inst)6688 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
6689 {
6690 	struct r600_bytecode_alu alu;
6691 	int i, r;
6692 
6693 	for (i = 0; i < 4; i++) {
6694 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6695 		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
6696 			alu.op = ALU_OP0_NOP;
6697 			alu.dst.chan = i;
6698 		} else {
6699 			alu.op = ALU_OP1_MOV;
6700 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6701 			alu.src[0].sel = ctx->temp_reg;
6702 			alu.src[0].chan = i;
6703 		}
6704 		if (i == 3) {
6705 			alu.last = 1;
6706 		}
6707 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6708 		if (r)
6709 			return r;
6710 	}
6711 	return 0;
6712 }
6713 
tgsi_make_src_for_op3(struct r600_shader_ctx * ctx,unsigned temp,int chan,struct r600_bytecode_alu_src * bc_src,const struct r600_shader_src * shader_src)6714 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
6715                                  unsigned temp, int chan,
6716                                  struct r600_bytecode_alu_src *bc_src,
6717                                  const struct r600_shader_src *shader_src)
6718 {
6719 	struct r600_bytecode_alu alu;
6720 	int r;
6721 
6722 	r600_bytecode_src(bc_src, shader_src, chan);
6723 
6724 	/* op3 operands don't support abs modifier */
6725 	if (bc_src->abs) {
6726 		assert(temp!=0);      /* we actually need the extra register, make sure it is allocated. */
6727 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6728 		alu.op = ALU_OP1_MOV;
6729 		alu.dst.sel = temp;
6730 		alu.dst.chan = chan;
6731 		alu.dst.write = 1;
6732 
6733 		alu.src[0] = *bc_src;
6734 		alu.last = true; // sufficient?
6735 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6736 		if (r)
6737 			return r;
6738 
6739 		memset(bc_src, 0, sizeof(*bc_src));
6740 		bc_src->sel = temp;
6741 		bc_src->chan = chan;
6742 	}
6743 	return 0;
6744 }
6745 
tgsi_op3_dst(struct r600_shader_ctx * ctx,int dst)6746 static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
6747 {
6748 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6749 	struct r600_bytecode_alu alu;
6750 	int i, j, r;
6751 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6752 	int temp_regs[4];
6753 	unsigned op = ctx->inst_info->op;
6754 
6755 	if (op == ALU_OP3_MULADD_IEEE &&
6756 	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
6757 		op = ALU_OP3_MULADD;
6758 
6759 	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6760 		temp_regs[j] = 0;
6761 		if (ctx->src[j].abs)
6762 			temp_regs[j] = r600_get_temp(ctx);
6763 	}
6764 	for (i = 0; i < lasti + 1; i++) {
6765 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6766 			continue;
6767 
6768 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6769 		alu.op = op;
6770 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6771 			r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
6772 			if (r)
6773 				return r;
6774 		}
6775 
6776 		if (dst == -1) {
6777 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6778 		} else {
6779 			alu.dst.sel = dst;
6780 		}
6781 		alu.dst.chan = i;
6782 		alu.dst.write = 1;
6783 		alu.is_op3 = 1;
6784 		if (i == lasti) {
6785 			alu.last = 1;
6786 		}
6787 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6788 		if (r)
6789 			return r;
6790 	}
6791 	return 0;
6792 }
6793 
/* Three-operand ALU op that writes the instruction's own TGSI destination. */
static int tgsi_op3(struct r600_shader_ctx *ctx)
{
	/* -1 tells tgsi_op3_dst() to use the TGSI destination register */
	const int use_tgsi_dst = -1;

	return tgsi_op3_dst(ctx, use_tgsi_dst);
}
6798 
tgsi_dp(struct r600_shader_ctx * ctx)6799 static int tgsi_dp(struct r600_shader_ctx *ctx)
6800 {
6801 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6802 	struct r600_bytecode_alu alu;
6803 	int i, j, r;
6804 	unsigned op = ctx->inst_info->op;
6805 	if (op == ALU_OP2_DOT4_IEEE &&
6806 	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
6807 		op = ALU_OP2_DOT4;
6808 
6809 	for (i = 0; i < 4; i++) {
6810 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6811 		alu.op = op;
6812 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6813 			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
6814 		}
6815 
6816 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6817 		alu.dst.chan = i;
6818 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
6819 		/* handle some special cases */
6820 		switch (inst->Instruction.Opcode) {
6821 		case TGSI_OPCODE_DP2:
6822 			if (i > 1) {
6823 				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6824 				alu.src[0].chan = alu.src[1].chan = 0;
6825 			}
6826 			break;
6827 		case TGSI_OPCODE_DP3:
6828 			if (i > 2) {
6829 				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6830 				alu.src[0].chan = alu.src[1].chan = 0;
6831 			}
6832 			break;
6833 		default:
6834 			break;
6835 		}
6836 		if (i == 3) {
6837 			alu.last = 1;
6838 		}
6839 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6840 		if (r)
6841 			return r;
6842 	}
6843 	return 0;
6844 }
6845 
tgsi_tex_src_requires_loading(struct r600_shader_ctx * ctx,unsigned index)6846 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
6847 						    unsigned index)
6848 {
6849 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6850 	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
6851 		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
6852 		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
6853 		ctx->src[index].neg || ctx->src[index].abs ||
6854 		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
6855 }
6856 
tgsi_tex_get_src_gpr(struct r600_shader_ctx * ctx,unsigned index)6857 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
6858 					unsigned index)
6859 {
6860 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6861 	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
6862 }
6863 
/*
 * Emit a vertex-fetch (VFETCH) for the current instruction, using source 0 as
 * the fetch address and source 1 to identify the buffer resource.
 *
 * ctx                  - shader translation context.
 * src_requires_loading - when true, source 0 is first copied channel by
 *                        channel into ctx->temp_reg and the fetch reads from
 *                        that temporary instead.
 *
 * On chips older than EVERGREEN the fetched value is post-processed with
 * constants read from the buffer-info constant buffer: every written channel
 * is ANDed with one constant, and channel W is then ORed with another.
 *
 * Returns 0 on success, or the first non-zero error reported by the bytecode
 * emitters.
 */
static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	const unsigned writemask = inst->Dst[0].Register.WriteMask;
	const int buf_id = tgsi_tex_get_src_gpr(ctx, 1);
	/* CF_INDEX_1 when the resource is indirectly indexed via index 2, else CF_INDEX_NONE */
	const int index_mode = (inst->Src[1].Indirect.Index == 2) ? 2 : 0;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_alu alu;
	int coord_gpr = tgsi_tex_get_src_gpr(ctx, 0);
	int last_chan;
	int chan, err;

	if (src_requires_loading) {
		/* Copy all four channels of source 0 into the temp register. */
		for (chan = 0; chan < 4; chan++) {
			memset(&alu, 0, sizeof(alu));
			alu.op = ALU_OP1_MOV;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], chan);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = chan;
			alu.dst.write = 1;
			alu.last = (chan == 3) ? 1 : 0;

			err = r600_bytecode_add_alu(ctx->bc, &alu);
			if (err)
				return err;
		}
		coord_gpr = ctx->temp_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = buf_id + R600_MAX_CONST_BUFFERS;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = coord_gpr;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	/* Selector 7 masks out a channel that is absent from the write mask. */
	vtx.dst_sel_x = (writemask & 1) ? 0 : 7;	/* SEL_X */
	vtx.dst_sel_y = (writemask & 2) ? 1 : 7;	/* SEL_Y */
	vtx.dst_sel_z = (writemask & 4) ? 2 : 7;	/* SEL_Z */
	vtx.dst_sel_w = (writemask & 8) ? 3 : 7;	/* SEL_W */
	vtx.use_const_fields = 1;
	vtx.buffer_index_mode = index_mode;

	err = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (err)
		return err;

	/* EVERGREEN and newer need no fix-up of the fetched value. */
	if (ctx->bc->chip_class >= EVERGREEN)
		return 0;

	/* dst.chan &= buffer_info[buf_id * 2].chan for every written channel. */
	last_chan = tgsi_last_instruction(writemask);
	for (chan = 0; chan < 4; chan++) {
		if (!(writemask & (1 << chan)))
			continue;

		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_AND_INT;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = chan;

		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (buf_id * 2);
		alu.src[1].chan = chan;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		alu.dst.sel = vtx.dst_gpr;
		alu.dst.chan = chan;
		alu.dst.write = 1;

		if (chan == last_chan)
			alu.last = 1;

		err = r600_bytecode_add_alu(ctx->bc, &alu);
		if (err)
			return err;
	}

	/*
	 * dst.w |= buffer_info[buf_id * 2 + 1].x
	 *
	 * NOTE(review): the guard tests the X/Y bits of the write mask while
	 * the instruction writes channel W - confirm this is intended.
	 */
	if (writemask & 3) {
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_OR_INT;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = 3;

		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (buf_id * 2) + 1;
		alu.src[1].chan = 0;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		alu.dst.sel = vtx.dst_gpr;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;

		err = r600_bytecode_add_alu(ctx->bc, &alu);
		if (err)
			return err;
	}

	return 0;
}
6960 
r600_do_buffer_txq(struct r600_shader_ctx * ctx,int reg_idx,int offset)6961 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset)
6962 {
6963 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6964 	int r;
6965 	int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
6966 	int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
6967 
6968 	if (ctx->bc->chip_class < EVERGREEN) {
6969 		struct r600_bytecode_alu alu;
6970 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6971 		alu.op = ALU_OP1_MOV;
6972 		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6973 		/* r600 we have them at channel 2 of the second dword */
6974 		alu.src[0].sel += (id * 2) + 1;
6975 		alu.src[0].chan = 1;
6976 		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6977 		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
6978 		alu.last = 1;
6979 		r = r600_bytecode_add_alu(ctx->bc, &alu);
6980 		if (r)
6981 			return r;
6982 		return 0;
6983 	} else {
6984 		struct r600_bytecode_vtx vtx;
6985 		memset(&vtx, 0, sizeof(vtx));
6986 		vtx.op = FETCH_OP_GDS_MIN_UINT; /* aka GET_BUFFER_RESINFO */
6987 		vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
6988 		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
6989 		vtx.src_gpr = 0;
6990 		vtx.mega_fetch_count = 16; /* no idea here really... */
6991 		vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6992 		vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
6993 		vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7;		/* SEL_Y */
6994 		vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7;		/* SEL_Z */
6995 		vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7;		/* SEL_W */
6996 		vtx.data_format = FMT_32_32_32_32;
6997 		vtx.buffer_index_mode = sampler_index_mode;
6998 
6999 		if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
7000 			return r;
7001 		return 0;
7002 	}
7003 }
7004 
7005 
tgsi_tex(struct r600_shader_ctx * ctx)7006 static int tgsi_tex(struct r600_shader_ctx *ctx)
7007 {
7008 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7009 	struct r600_bytecode_tex tex;
7010 	struct r600_bytecode_alu alu;
7011 	unsigned src_gpr;
7012 	int r, i, j;
7013 	int opcode;
7014 	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
7015 				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7016 				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
7017 				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
7018 
7019 	bool txf_add_offsets = inst->Texture.NumOffsets &&
7020 			     inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7021 			     inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
7022 
7023 	/* Texture fetch instructions can only use gprs as source.
7024 	 * Also they cannot negate the source or take the absolute value */
7025 	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
7026                                               tgsi_tex_src_requires_loading(ctx, 0)) ||
7027 					     read_compressed_msaa || txf_add_offsets;
7028 
7029 	boolean src_loaded = FALSE;
7030 	unsigned sampler_src_reg = 1;
7031 	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
7032 	boolean has_txq_cube_array_z = false;
7033 	unsigned sampler_index_mode;
7034 
7035 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
7036 	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7037 	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
7038 		if (inst->Dst[0].Register.WriteMask & 4) {
7039 			ctx->shader->has_txq_cube_array_z_comp = true;
7040 			has_txq_cube_array_z = true;
7041 		}
7042 
7043 	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
7044 	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7045 	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
7046 	    inst->Instruction.Opcode == TGSI_OPCODE_TG4)
7047 		sampler_src_reg = 2;
7048 
7049 	/* TGSI moves the sampler to src reg 3 for TXD */
7050 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
7051 		sampler_src_reg = 3;
7052 
7053 	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7054 
7055 	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7056 
7057 	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
7058 		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
7059 			if (ctx->bc->chip_class < EVERGREEN)
7060 				ctx->shader->uses_tex_buffers = true;
7061 			return r600_do_buffer_txq(ctx, 1, 0);
7062 		}
7063 		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
7064 			if (ctx->bc->chip_class < EVERGREEN)
7065 				ctx->shader->uses_tex_buffers = true;
7066 			return do_vtx_fetch_inst(ctx, src_requires_loading);
7067 		}
7068 	}
7069 
7070 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
7071 		int out_chan;
7072 		/* Add perspective divide */
7073 		if (ctx->bc->chip_class == CAYMAN) {
7074 			out_chan = 2;
7075 			for (i = 0; i < 3; i++) {
7076 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7077 				alu.op = ALU_OP1_RECIP_IEEE;
7078 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7079 
7080 				alu.dst.sel = ctx->temp_reg;
7081 				alu.dst.chan = i;
7082 				if (i == 2)
7083 					alu.last = 1;
7084 				if (out_chan == i)
7085 					alu.dst.write = 1;
7086 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7087 				if (r)
7088 					return r;
7089 			}
7090 
7091 		} else {
7092 			out_chan = 3;
7093 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7094 			alu.op = ALU_OP1_RECIP_IEEE;
7095 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7096 
7097 			alu.dst.sel = ctx->temp_reg;
7098 			alu.dst.chan = out_chan;
7099 			alu.last = 1;
7100 			alu.dst.write = 1;
7101 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7102 			if (r)
7103 				return r;
7104 		}
7105 
7106 		for (i = 0; i < 3; i++) {
7107 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7108 			alu.op = ALU_OP2_MUL;
7109 			alu.src[0].sel = ctx->temp_reg;
7110 			alu.src[0].chan = out_chan;
7111 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7112 			alu.dst.sel = ctx->temp_reg;
7113 			alu.dst.chan = i;
7114 			alu.dst.write = 1;
7115 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7116 			if (r)
7117 				return r;
7118 		}
7119 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7120 		alu.op = ALU_OP1_MOV;
7121 		alu.src[0].sel = V_SQ_ALU_SRC_1;
7122 		alu.src[0].chan = 0;
7123 		alu.dst.sel = ctx->temp_reg;
7124 		alu.dst.chan = 3;
7125 		alu.last = 1;
7126 		alu.dst.write = 1;
7127 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7128 		if (r)
7129 			return r;
7130 		src_loaded = TRUE;
7131 		src_gpr = ctx->temp_reg;
7132 	}
7133 
7134 
7135 	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7136 	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7137 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7138 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7139 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7140 
7141 		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7142 		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7143 
7144 		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7145 		for (i = 0; i < 4; i++) {
7146 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7147 			alu.op = ALU_OP2_CUBE;
7148 			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7149 			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7150 			alu.dst.sel = ctx->temp_reg;
7151 			alu.dst.chan = i;
7152 			if (i == 3)
7153 				alu.last = 1;
7154 			alu.dst.write = 1;
7155 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7156 			if (r)
7157 				return r;
7158 		}
7159 
7160 		/* tmp1.z = RCP_e(|tmp1.z|) */
7161 		if (ctx->bc->chip_class == CAYMAN) {
7162 			for (i = 0; i < 3; i++) {
7163 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7164 				alu.op = ALU_OP1_RECIP_IEEE;
7165 				alu.src[0].sel = ctx->temp_reg;
7166 				alu.src[0].chan = 2;
7167 				alu.src[0].abs = 1;
7168 				alu.dst.sel = ctx->temp_reg;
7169 				alu.dst.chan = i;
7170 				if (i == 2)
7171 					alu.dst.write = 1;
7172 				if (i == 2)
7173 					alu.last = 1;
7174 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7175 				if (r)
7176 					return r;
7177 			}
7178 		} else {
7179 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7180 			alu.op = ALU_OP1_RECIP_IEEE;
7181 			alu.src[0].sel = ctx->temp_reg;
7182 			alu.src[0].chan = 2;
7183 			alu.src[0].abs = 1;
7184 			alu.dst.sel = ctx->temp_reg;
7185 			alu.dst.chan = 2;
7186 			alu.dst.write = 1;
7187 			alu.last = 1;
7188 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7189 			if (r)
7190 				return r;
7191 		}
7192 
7193 		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
7194 		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
7195 		 * muladd has no writemask, have to use another temp
7196 		 */
7197 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7198 		alu.op = ALU_OP3_MULADD;
7199 		alu.is_op3 = 1;
7200 
7201 		alu.src[0].sel = ctx->temp_reg;
7202 		alu.src[0].chan = 0;
7203 		alu.src[1].sel = ctx->temp_reg;
7204 		alu.src[1].chan = 2;
7205 
7206 		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7207 		alu.src[2].chan = 0;
7208 		alu.src[2].value = u_bitcast_f2u(1.5f);
7209 
7210 		alu.dst.sel = ctx->temp_reg;
7211 		alu.dst.chan = 0;
7212 		alu.dst.write = 1;
7213 
7214 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7215 		if (r)
7216 			return r;
7217 
7218 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7219 		alu.op = ALU_OP3_MULADD;
7220 		alu.is_op3 = 1;
7221 
7222 		alu.src[0].sel = ctx->temp_reg;
7223 		alu.src[0].chan = 1;
7224 		alu.src[1].sel = ctx->temp_reg;
7225 		alu.src[1].chan = 2;
7226 
7227 		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7228 		alu.src[2].chan = 0;
7229 		alu.src[2].value = u_bitcast_f2u(1.5f);
7230 
7231 		alu.dst.sel = ctx->temp_reg;
7232 		alu.dst.chan = 1;
7233 		alu.dst.write = 1;
7234 
7235 		alu.last = 1;
7236 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7237 		if (r)
7238 			return r;
7239 		/* write initial compare value into Z component
7240 		  - W src 0 for shadow cube
7241 		  - X src 1 for shadow cube array */
7242 		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7243 		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7244 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7245 			alu.op = ALU_OP1_MOV;
7246 			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7247 				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7248 			else
7249 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7250 			alu.dst.sel = ctx->temp_reg;
7251 			alu.dst.chan = 2;
7252 			alu.dst.write = 1;
7253 			alu.last = 1;
7254 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7255 			if (r)
7256 				return r;
7257 		}
7258 
7259 		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7260 		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7261 			if (ctx->bc->chip_class >= EVERGREEN) {
7262 				int mytmp = r600_get_temp(ctx);
7263 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7264 				alu.op = ALU_OP1_MOV;
7265 				alu.src[0].sel = ctx->temp_reg;
7266 				alu.src[0].chan = 3;
7267 				alu.dst.sel = mytmp;
7268 				alu.dst.chan = 0;
7269 				alu.dst.write = 1;
7270 				alu.last = 1;
7271 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7272 				if (r)
7273 					return r;
7274 
7275 				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7276 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7277 				alu.op = ALU_OP3_MULADD;
7278 				alu.is_op3 = 1;
7279 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7280 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7281 				alu.src[1].chan = 0;
7282 				alu.src[1].value = u_bitcast_f2u(8.0f);
7283 				alu.src[2].sel = mytmp;
7284 				alu.src[2].chan = 0;
7285 				alu.dst.sel = ctx->temp_reg;
7286 				alu.dst.chan = 3;
7287 				alu.dst.write = 1;
7288 				alu.last = 1;
7289 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7290 				if (r)
7291 					return r;
7292 			} else if (ctx->bc->chip_class < EVERGREEN) {
7293 				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7294 				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7295 				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7296 				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7297 				tex.src_gpr = r600_get_temp(ctx);
7298 				tex.src_sel_x = 0;
7299 				tex.src_sel_y = 0;
7300 				tex.src_sel_z = 0;
7301 				tex.src_sel_w = 0;
7302 				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7303 				tex.coord_type_x = 1;
7304 				tex.coord_type_y = 1;
7305 				tex.coord_type_z = 1;
7306 				tex.coord_type_w = 1;
7307 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7308 				alu.op = ALU_OP1_MOV;
7309 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7310 				alu.dst.sel = tex.src_gpr;
7311 				alu.dst.chan = 0;
7312 				alu.last = 1;
7313 				alu.dst.write = 1;
7314 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7315 				if (r)
7316 					return r;
7317 
7318 				r = r600_bytecode_add_tex(ctx->bc, &tex);
7319 				if (r)
7320 					return r;
7321 			}
7322 
7323 		}
7324 
7325 		/* for cube forms of lod and bias we need to route things */
7326 		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7327 		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7328 		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7329 		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7330 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7331 			alu.op = ALU_OP1_MOV;
7332 			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7333 			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7334 				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7335 			else
7336 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7337 			alu.dst.sel = ctx->temp_reg;
7338 			alu.dst.chan = 2;
7339 			alu.last = 1;
7340 			alu.dst.write = 1;
7341 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7342 			if (r)
7343 				return r;
7344 		}
7345 
7346 		src_loaded = TRUE;
7347 		src_gpr = ctx->temp_reg;
7348 	}
7349 
7350 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7351 		int temp_h = 0, temp_v = 0;
7352 		int start_val = 0;
7353 
7354 		/* if we've already loaded the src (i.e. CUBE don't reload it). */
7355 		if (src_loaded == TRUE)
7356 			start_val = 1;
7357 		else
7358 			src_loaded = TRUE;
7359 		for (i = start_val; i < 3; i++) {
7360 			int treg = r600_get_temp(ctx);
7361 
7362 			if (i == 0)
7363 				src_gpr = treg;
7364 			else if (i == 1)
7365 				temp_h = treg;
7366 			else
7367 				temp_v = treg;
7368 
7369 			for (j = 0; j < 4; j++) {
7370 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7371 				alu.op = ALU_OP1_MOV;
7372                                 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7373                                 alu.dst.sel = treg;
7374                                 alu.dst.chan = j;
7375                                 if (j == 3)
7376                                    alu.last = 1;
7377                                 alu.dst.write = 1;
7378                                 r = r600_bytecode_add_alu(ctx->bc, &alu);
7379                                 if (r)
7380                                     return r;
7381 			}
7382 		}
7383 		for (i = 1; i < 3; i++) {
7384 			/* set gradients h/v */
7385 			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7386 			tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7387 				FETCH_OP_SET_GRADIENTS_V;
7388 			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7389 			tex.sampler_index_mode = sampler_index_mode;
7390 			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7391 			tex.resource_index_mode = sampler_index_mode;
7392 
7393 			tex.src_gpr = (i == 1) ? temp_h : temp_v;
7394 			tex.src_sel_x = 0;
7395 			tex.src_sel_y = 1;
7396 			tex.src_sel_z = 2;
7397 			tex.src_sel_w = 3;
7398 
7399 			tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7400 			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7401 			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7402 				tex.coord_type_x = 1;
7403 				tex.coord_type_y = 1;
7404 				tex.coord_type_z = 1;
7405 				tex.coord_type_w = 1;
7406 			}
7407 			r = r600_bytecode_add_tex(ctx->bc, &tex);
7408 			if (r)
7409 				return r;
7410 		}
7411 	}
7412 
7413 	if (src_requires_loading && !src_loaded) {
7414 		for (i = 0; i < 4; i++) {
7415 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7416 			alu.op = ALU_OP1_MOV;
7417 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7418 			alu.dst.sel = ctx->temp_reg;
7419 			alu.dst.chan = i;
7420 			if (i == 3)
7421 				alu.last = 1;
7422 			alu.dst.write = 1;
7423 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7424 			if (r)
7425 				return r;
7426 		}
7427 		src_loaded = TRUE;
7428 		src_gpr = ctx->temp_reg;
7429 	}
7430 
7431 	/* get offset values */
7432 	if (inst->Texture.NumOffsets) {
7433 		assert(inst->Texture.NumOffsets == 1);
7434 
7435 		/* The texture offset feature doesn't work with the TXF instruction
7436 		 * and must be emulated by adding the offset to the texture coordinates. */
7437 		if (txf_add_offsets) {
7438 			const struct tgsi_texture_offset *off = inst->TexOffsets;
7439 
7440 			switch (inst->Texture.Texture) {
7441 			case TGSI_TEXTURE_3D:
7442 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7443 				alu.op = ALU_OP2_ADD_INT;
7444 				alu.src[0].sel = src_gpr;
7445 				alu.src[0].chan = 2;
7446 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7447 				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
7448 				alu.dst.sel = src_gpr;
7449 				alu.dst.chan = 2;
7450 				alu.dst.write = 1;
7451 				alu.last = 1;
7452 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7453 				if (r)
7454 					return r;
7455 				/* fall through */
7456 
7457 			case TGSI_TEXTURE_2D:
7458 			case TGSI_TEXTURE_SHADOW2D:
7459 			case TGSI_TEXTURE_RECT:
7460 			case TGSI_TEXTURE_SHADOWRECT:
7461 			case TGSI_TEXTURE_2D_ARRAY:
7462 			case TGSI_TEXTURE_SHADOW2D_ARRAY:
7463 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7464 				alu.op = ALU_OP2_ADD_INT;
7465 				alu.src[0].sel = src_gpr;
7466 				alu.src[0].chan = 1;
7467 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7468 				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
7469 				alu.dst.sel = src_gpr;
7470 				alu.dst.chan = 1;
7471 				alu.dst.write = 1;
7472 				alu.last = 1;
7473 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7474 				if (r)
7475 					return r;
7476 				/* fall through */
7477 
7478 			case TGSI_TEXTURE_1D:
7479 			case TGSI_TEXTURE_SHADOW1D:
7480 			case TGSI_TEXTURE_1D_ARRAY:
7481 			case TGSI_TEXTURE_SHADOW1D_ARRAY:
7482 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7483 				alu.op = ALU_OP2_ADD_INT;
7484 				alu.src[0].sel = src_gpr;
7485 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7486 				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
7487 				alu.dst.sel = src_gpr;
7488 				alu.dst.write = 1;
7489 				alu.last = 1;
7490 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7491 				if (r)
7492 					return r;
7493 				break;
7494 				/* texture offsets do not apply to other texture targets */
7495 			}
7496 		} else {
7497 			switch (inst->Texture.Texture) {
7498 			case TGSI_TEXTURE_3D:
7499 				offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
7500 				/* fallthrough */
7501 			case TGSI_TEXTURE_2D:
7502 			case TGSI_TEXTURE_SHADOW2D:
7503 			case TGSI_TEXTURE_RECT:
7504 			case TGSI_TEXTURE_SHADOWRECT:
7505 			case TGSI_TEXTURE_2D_ARRAY:
7506 			case TGSI_TEXTURE_SHADOW2D_ARRAY:
7507 				offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
7508 				/* fallthrough */
7509 			case TGSI_TEXTURE_1D:
7510 			case TGSI_TEXTURE_SHADOW1D:
7511 			case TGSI_TEXTURE_1D_ARRAY:
7512 			case TGSI_TEXTURE_SHADOW1D_ARRAY:
7513 				offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
7514 			}
7515 		}
7516 	}
7517 
7518 	/* Obtain the sample index for reading a compressed MSAA color texture.
7519 	 * To read the FMASK, we use the ldfptr instruction, which tells us
7520 	 * where the samples are stored.
7521 	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
7522 	 * which is the identity mapping. Each nibble says which physical sample
7523 	 * should be fetched to get that sample.
7524 	 *
7525 	 * Assume src.z contains the sample index. It should be modified like this:
7526 	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
7527 	 * Then fetch the texel with src.
7528 	 */
7529 	if (read_compressed_msaa) {
7530 		unsigned sample_chan = 3;
7531 		unsigned temp = r600_get_temp(ctx);
7532 		assert(src_loaded);
7533 
7534 		/* temp.w = ldfptr() */
7535 		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7536 		tex.op = FETCH_OP_LD;
7537 		tex.inst_mod = 1; /* to indicate this is ldfptr */
7538 		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7539 		tex.sampler_index_mode = sampler_index_mode;
7540 		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7541 		tex.resource_index_mode = sampler_index_mode;
7542 		tex.src_gpr = src_gpr;
7543 		tex.dst_gpr = temp;
7544 		tex.dst_sel_x = 7; /* mask out these components */
7545 		tex.dst_sel_y = 7;
7546 		tex.dst_sel_z = 7;
7547 		tex.dst_sel_w = 0; /* store X */
7548 		tex.src_sel_x = 0;
7549 		tex.src_sel_y = 1;
7550 		tex.src_sel_z = 2;
7551 		tex.src_sel_w = 3;
7552 		tex.offset_x = offset_x;
7553 		tex.offset_y = offset_y;
7554 		tex.offset_z = offset_z;
7555 		r = r600_bytecode_add_tex(ctx->bc, &tex);
7556 		if (r)
7557 			return r;
7558 
7559 		/* temp.x = sample_index*4 */
7560 		if (ctx->bc->chip_class == CAYMAN) {
7561 			for (i = 0 ; i < 4; i++) {
7562 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7563 				alu.op = ALU_OP2_MULLO_INT;
7564 				alu.src[0].sel = src_gpr;
7565 				alu.src[0].chan = sample_chan;
7566 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7567 				alu.src[1].value = 4;
7568 				alu.dst.sel = temp;
7569 				alu.dst.chan = i;
7570 				alu.dst.write = i == 0;
7571 				if (i == 3)
7572 					alu.last = 1;
7573 				r = r600_bytecode_add_alu(ctx->bc, &alu);
7574 				if (r)
7575 					return r;
7576 			}
7577 		} else {
7578 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7579 			alu.op = ALU_OP2_MULLO_INT;
7580 			alu.src[0].sel = src_gpr;
7581 			alu.src[0].chan = sample_chan;
7582 			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7583 			alu.src[1].value = 4;
7584 			alu.dst.sel = temp;
7585 			alu.dst.chan = 0;
7586 			alu.dst.write = 1;
7587 			alu.last = 1;
7588 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7589 			if (r)
7590 				return r;
7591 		}
7592 
7593 		/* sample_index = temp.w >> temp.x */
7594 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7595 		alu.op = ALU_OP2_LSHR_INT;
7596 		alu.src[0].sel = temp;
7597 		alu.src[0].chan = 3;
7598 		alu.src[1].sel = temp;
7599 		alu.src[1].chan = 0;
7600 		alu.dst.sel = src_gpr;
7601 		alu.dst.chan = sample_chan;
7602 		alu.dst.write = 1;
7603 		alu.last = 1;
7604 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7605 		if (r)
7606 			return r;
7607 
7608 		/* sample_index & 0xF */
7609 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7610 		alu.op = ALU_OP2_AND_INT;
7611 		alu.src[0].sel = src_gpr;
7612 		alu.src[0].chan = sample_chan;
7613 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7614 		alu.src[1].value = 0xF;
7615 		alu.dst.sel = src_gpr;
7616 		alu.dst.chan = sample_chan;
7617 		alu.dst.write = 1;
7618 		alu.last = 1;
7619 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7620 		if (r)
7621 			return r;
7622 #if 0
7623 		/* visualize the FMASK */
7624 		for (i = 0; i < 4; i++) {
7625 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7626 			alu.op = ALU_OP1_INT_TO_FLT;
7627 			alu.src[0].sel = src_gpr;
7628 			alu.src[0].chan = sample_chan;
7629 			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7630 			alu.dst.chan = i;
7631 			alu.dst.write = 1;
7632 			alu.last = 1;
7633 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7634 			if (r)
7635 				return r;
7636 		}
7637 		return 0;
7638 #endif
7639 	}
7640 
7641 	/* does this shader want a num layers from TXQ for a cube array? */
7642 	if (has_txq_cube_array_z) {
7643 		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7644 
7645 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7646 		alu.op = ALU_OP1_MOV;
7647 
7648 		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7649 		if (ctx->bc->chip_class >= EVERGREEN) {
7650 			/* with eg each dword is number of cubes */
7651 			alu.src[0].sel += id / 4;
7652 			alu.src[0].chan = id % 4;
7653 		} else {
7654 			/* r600 we have them at channel 2 of the second dword */
7655 			alu.src[0].sel += (id * 2) + 1;
7656 			alu.src[0].chan = 2;
7657 		}
7658 		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7659 		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
7660 		alu.last = 1;
7661 		r = r600_bytecode_add_alu(ctx->bc, &alu);
7662 		if (r)
7663 			return r;
7664 		/* disable writemask from texture instruction */
7665 		inst->Dst[0].Register.WriteMask &= ~4;
7666 	}
7667 
7668 	opcode = ctx->inst_info->op;
7669 	if (opcode == FETCH_OP_GATHER4 &&
7670 		inst->TexOffsets[0].File != TGSI_FILE_NULL &&
7671 		inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
7672 		opcode = FETCH_OP_GATHER4_O;
7673 
7674 		/* GATHER4_O/GATHER4_C_O use offset values loaded by
7675 		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
7676 		   encoded in the instruction are ignored. */
7677 		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7678 		tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
7679 		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7680 		tex.sampler_index_mode = sampler_index_mode;
7681 		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7682 		tex.resource_index_mode = sampler_index_mode;
7683 
7684 		tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
7685 		tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
7686 		tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
7687 		tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
7688 		tex.src_sel_w = 4;
7689 
7690 		tex.dst_sel_x = 7;
7691 		tex.dst_sel_y = 7;
7692 		tex.dst_sel_z = 7;
7693 		tex.dst_sel_w = 7;
7694 
7695 		r = r600_bytecode_add_tex(ctx->bc, &tex);
7696 		if (r)
7697 			return r;
7698 	}
7699 
7700 	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7701 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7702 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7703 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7704 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
7705 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7706 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7707 		switch (opcode) {
7708 		case FETCH_OP_SAMPLE:
7709 			opcode = FETCH_OP_SAMPLE_C;
7710 			break;
7711 		case FETCH_OP_SAMPLE_L:
7712 			opcode = FETCH_OP_SAMPLE_C_L;
7713 			break;
7714 		case FETCH_OP_SAMPLE_LB:
7715 			opcode = FETCH_OP_SAMPLE_C_LB;
7716 			break;
7717 		case FETCH_OP_SAMPLE_G:
7718 			opcode = FETCH_OP_SAMPLE_C_G;
7719 			break;
7720 		/* Texture gather variants */
7721 		case FETCH_OP_GATHER4:
7722 			opcode = FETCH_OP_GATHER4_C;
7723 			break;
7724 		case FETCH_OP_GATHER4_O:
7725 			opcode = FETCH_OP_GATHER4_C_O;
7726 			break;
7727 		}
7728 	}
7729 
7730 	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7731 	tex.op = opcode;
7732 
7733 	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7734 	tex.sampler_index_mode = sampler_index_mode;
7735 	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7736 	tex.resource_index_mode = sampler_index_mode;
7737 	tex.src_gpr = src_gpr;
7738 	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7739 
7740 	if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
7741 		inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
7742 		tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
7743 	}
7744 
7745 	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7746 		int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
7747 		tex.inst_mod = texture_component_select;
7748 
7749 		if (ctx->bc->chip_class == CAYMAN) {
7750 		/* GATHER4 result order is different from TGSI TG4 */
7751 			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
7752 			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
7753 			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
7754 			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7755 		} else {
7756 			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7757 			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7758 			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7759 			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7760 		}
7761 	}
7762 	else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
7763 		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7764 		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7765 		tex.dst_sel_z = 7;
7766 		tex.dst_sel_w = 7;
7767 	}
7768 	else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7769 		tex.dst_sel_x = 3;
7770 		tex.dst_sel_y = 7;
7771 		tex.dst_sel_z = 7;
7772 		tex.dst_sel_w = 7;
7773 	}
7774 	else {
7775 		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7776 		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7777 		tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7778 		tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7779 	}
7780 
7781 
7782 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7783 		tex.src_sel_x = 4;
7784 		tex.src_sel_y = 4;
7785 		tex.src_sel_z = 4;
7786 		tex.src_sel_w = 4;
7787 	} else if (src_loaded) {
7788 		tex.src_sel_x = 0;
7789 		tex.src_sel_y = 1;
7790 		tex.src_sel_z = 2;
7791 		tex.src_sel_w = 3;
7792 	} else {
7793 		tex.src_sel_x = ctx->src[0].swizzle[0];
7794 		tex.src_sel_y = ctx->src[0].swizzle[1];
7795 		tex.src_sel_z = ctx->src[0].swizzle[2];
7796 		tex.src_sel_w = ctx->src[0].swizzle[3];
7797 		tex.src_rel = ctx->src[0].rel;
7798 	}
7799 
7800 	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7801 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7802 	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7803 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7804 		tex.src_sel_x = 1;
7805 		tex.src_sel_y = 0;
7806 		tex.src_sel_z = 3;
7807 		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
7808 	}
7809 
7810 	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
7811 	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
7812 		tex.coord_type_x = 1;
7813 		tex.coord_type_y = 1;
7814 	}
7815 	tex.coord_type_z = 1;
7816 	tex.coord_type_w = 1;
7817 
7818 	tex.offset_x = offset_x;
7819 	tex.offset_y = offset_y;
7820 	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
7821 		(inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7822 		 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
7823 		tex.offset_z = 0;
7824 	}
7825 	else {
7826 		tex.offset_z = offset_z;
7827 	}
7828 
7829 	/* Put the depth for comparison in W.
7830 	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
7831 	 * Some instructions expect the depth in Z. */
7832 	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7833 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7834 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7835 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
7836 	    opcode != FETCH_OP_SAMPLE_C_L &&
7837 	    opcode != FETCH_OP_SAMPLE_C_LB) {
7838 		tex.src_sel_w = tex.src_sel_z;
7839 	}
7840 
7841 	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
7842 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
7843 		if (opcode == FETCH_OP_SAMPLE_C_L ||
7844 		    opcode == FETCH_OP_SAMPLE_C_LB) {
7845 			/* the array index is read from Y */
7846 			tex.coord_type_y = 0;
7847 		} else {
7848 			/* the array index is read from Z */
7849 			tex.coord_type_z = 0;
7850 			tex.src_sel_z = tex.src_sel_y;
7851 		}
7852 	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7853 		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7854 		   ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7855 		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7856 		    (ctx->bc->chip_class >= EVERGREEN)))
7857 		/* the array index is read from Z */
7858 		tex.coord_type_z = 0;
7859 
7860 	/* mask unused source components */
7861 	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
7862 		switch (inst->Texture.Texture) {
7863 		case TGSI_TEXTURE_2D:
7864 		case TGSI_TEXTURE_RECT:
7865 			tex.src_sel_z = 7;
7866 			tex.src_sel_w = 7;
7867 			break;
7868 		case TGSI_TEXTURE_1D_ARRAY:
7869 			tex.src_sel_y = 7;
7870 			tex.src_sel_w = 7;
7871 			break;
7872 		case TGSI_TEXTURE_1D:
7873 			tex.src_sel_y = 7;
7874 			tex.src_sel_z = 7;
7875 			tex.src_sel_w = 7;
7876 			break;
7877 		}
7878 	}
7879 
7880 	r = r600_bytecode_add_tex(ctx->bc, &tex);
7881 	if (r)
7882 		return r;
7883 
7884 	/* add shadow ambient support  - gallium doesn't do it yet */
7885 	return 0;
7886 }
7887 
find_hw_atomic_counter(struct r600_shader_ctx * ctx,struct tgsi_full_src_register * src)7888 static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
7889 				  struct tgsi_full_src_register *src)
7890 {
7891 	unsigned i;
7892 
7893 	if (src->Register.Indirect) {
7894 		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
7895 			if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
7896 				return ctx->shader->atomics[i].hw_idx;
7897 		}
7898 	} else {
7899 		uint32_t index = src->Register.Index;
7900 		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
7901 			if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
7902 				continue;
7903 			if (index > ctx->shader->atomics[i].end)
7904 				continue;
7905 			if (index < ctx->shader->atomics[i].start)
7906 				continue;
7907 			uint32_t offset = (index - ctx->shader->atomics[i].start);
7908 			return ctx->shader->atomics[i].hw_idx + offset;
7909 		}
7910 	}
7911 	assert(0);
7912 	return -1;
7913 }
7914 
tgsi_set_gds_temp(struct r600_shader_ctx * ctx,int * uav_id_p,int * uav_index_mode_p)7915 static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
7916 			     int *uav_id_p, int *uav_index_mode_p)
7917 {
7918 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7919 	int uav_id, uav_index_mode = 0;
7920 	int r;
7921 	bool is_cm = (ctx->bc->chip_class == CAYMAN);
7922 
7923 	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
7924 
7925 	if (inst->Src[0].Register.Indirect) {
7926 		if (is_cm) {
7927 			struct r600_bytecode_alu alu;
7928 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7929 			alu.op = ALU_OP2_LSHL_INT;
7930 			alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
7931 			alu.src[0].chan = 0;
7932 			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7933 			alu.src[1].value = 2;
7934 			alu.dst.sel = ctx->temp_reg;
7935 			alu.dst.chan = 0;
7936 			alu.dst.write = 1;
7937 			alu.last = 1;
7938 			r = r600_bytecode_add_alu(ctx->bc, &alu);
7939 			if (r)
7940 				return r;
7941 
7942 			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
7943 					   ctx->temp_reg, 0,
7944 					   ctx->temp_reg, 0,
7945 					   V_SQ_ALU_SRC_LITERAL, uav_id * 4);
7946 			if (r)
7947 				return r;
7948 		} else
7949 			uav_index_mode = 2;
7950 	} else if (is_cm) {
7951 		r = single_alu_op2(ctx, ALU_OP1_MOV,
7952 				   ctx->temp_reg, 0,
7953 				   V_SQ_ALU_SRC_LITERAL, uav_id * 4,
7954 				   0, 0);
7955 		if (r)
7956 			return r;
7957 	}
7958 	*uav_id_p = uav_id;
7959 	*uav_index_mode_p = uav_index_mode;
7960 	return 0;
7961 }
7962 
tgsi_load_gds(struct r600_shader_ctx * ctx)7963 static int tgsi_load_gds(struct r600_shader_ctx *ctx)
7964 {
7965 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7966 	int r;
7967 	struct r600_bytecode_gds gds;
7968 	int uav_id = 0;
7969 	int uav_index_mode = 0;
7970 	bool is_cm = (ctx->bc->chip_class == CAYMAN);
7971 
7972 	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
7973 	if (r)
7974 		return r;
7975 
7976 	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
7977 	gds.op = FETCH_OP_GDS_READ_RET;
7978 	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7979 	gds.uav_id = is_cm ? 0 : uav_id;
7980 	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
7981 	gds.src_gpr = ctx->temp_reg;
7982 	gds.src_sel_x = (is_cm) ? 0 : 4;
7983 	gds.src_sel_y = 4;
7984 	gds.src_sel_z = 4;
7985 	gds.dst_sel_x = 0;
7986 	gds.dst_sel_y = 7;
7987 	gds.dst_sel_z = 7;
7988 	gds.dst_sel_w = 7;
7989 	gds.src_gpr2 = 0;
7990 	gds.alloc_consume = !is_cm;
7991 	r = r600_bytecode_add_gds(ctx->bc, &gds);
7992 	if (r)
7993 		return r;
7994 
7995 	ctx->bc->cf_last->vpm = 1;
7996 	return 0;
7997 }
7998 
7999 /* this fixes up 1D arrays properly */
load_index_src(struct r600_shader_ctx * ctx,int src_index,int * idx_gpr)8000 static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
8001 {
8002 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8003 	int r, i;
8004 	struct r600_bytecode_alu alu;
8005 	int temp_reg = r600_get_temp(ctx);
8006 
8007 	for (i = 0; i < 4; i++) {
8008 		bool def_val = true, write_zero = false;
8009 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8010 		alu.op = ALU_OP1_MOV;
8011 		alu.dst.sel = temp_reg;
8012 		alu.dst.chan = i;
8013 
8014 		switch (inst->Memory.Texture) {
8015 		case TGSI_TEXTURE_BUFFER:
8016 		case TGSI_TEXTURE_1D:
8017 			if (i == 1 || i == 2 || i == 3) {
8018 				write_zero = true;
8019 			}
8020 			break;
8021 		case TGSI_TEXTURE_1D_ARRAY:
8022 			if (i == 1 || i == 3)
8023 				write_zero = true;
8024 			else if (i == 2) {
8025 				r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
8026 				def_val = false;
8027 			}
8028 			break;
8029 		case TGSI_TEXTURE_2D:
8030 			if (i == 2 || i == 3)
8031 				write_zero = true;
8032 			break;
8033 		default:
8034 			if (i == 3)
8035 				write_zero = true;
8036 			break;
8037 		}
8038 
8039 		if (write_zero) {
8040 			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8041 			alu.src[0].value = 0;
8042 		} else if (def_val) {
8043 			r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
8044 		}
8045 
8046 		if (i == 3)
8047 			alu.last = 1;
8048 		alu.dst.write = 1;
8049 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8050 		if (r)
8051 			return r;
8052 	}
8053 	*idx_gpr = temp_reg;
8054 	return 0;
8055 }
8056 
load_buffer_coord(struct r600_shader_ctx * ctx,int src_idx,int temp_reg)8057 static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
8058 			     int temp_reg)
8059 {
8060 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8061 	int r;
8062 	if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
8063 		int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
8064 		r = single_alu_op2(ctx, ALU_OP1_MOV,
8065 				   temp_reg, 0,
8066 				   V_SQ_ALU_SRC_LITERAL, value >> 2,
8067 				   0, 0);
8068 		if (r)
8069 			return r;
8070 	} else {
8071 		struct r600_bytecode_alu alu;
8072 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8073 		alu.op = ALU_OP2_LSHR_INT;
8074 		r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
8075 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8076 		alu.src[1].value = 2;
8077 		alu.dst.sel = temp_reg;
8078 		alu.dst.write = 1;
8079 		alu.last = 1;
8080 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8081 		if (r)
8082 			return r;
8083 	}
8084 	return 0;
8085 }
8086 
tgsi_load_buffer(struct r600_shader_ctx * ctx)8087 static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
8088 {
8089 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8090 	/* have to work out the offset into the RAT immediate return buffer */
8091 	struct r600_bytecode_vtx vtx;
8092 	struct r600_bytecode_cf *cf;
8093 	int r;
8094 	int temp_reg = r600_get_temp(ctx);
8095 	unsigned rat_index_mode;
8096 	unsigned base;
8097 
8098 	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8099 	base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];
8100 
8101 	r = load_buffer_coord(ctx, 1, temp_reg);
8102 	if (r)
8103 		return r;
8104 	ctx->bc->cf_last->barrier = 1;
8105 	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8106 	vtx.op = FETCH_OP_VFETCH;
8107 	vtx.buffer_id = inst->Src[0].Register.Index + base;
8108 	vtx.buffer_index_mode = rat_index_mode;
8109 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8110 	vtx.src_gpr = temp_reg;
8111 	vtx.src_sel_x = 0;
8112 	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8113 	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
8114 	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
8115 	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
8116 	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
8117 	vtx.num_format_all = 1;
8118 	vtx.format_comp_all = 1;
8119 	vtx.srf_mode_all = 0;
8120 
8121 	if (inst->Dst[0].Register.WriteMask & 8) {
8122 		vtx.data_format = FMT_32_32_32_32;
8123 		vtx.use_const_fields = 0;
8124 	} else if (inst->Dst[0].Register.WriteMask & 4) {
8125 		vtx.data_format = FMT_32_32_32;
8126 		vtx.use_const_fields = 0;
8127 	} else if (inst->Dst[0].Register.WriteMask & 2) {
8128 		vtx.data_format = FMT_32_32;
8129 		vtx.use_const_fields = 0;
8130 	} else {
8131 		vtx.data_format = FMT_32;
8132 		vtx.use_const_fields = 0;
8133 	}
8134 
8135 	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8136 	if (r)
8137 		return r;
8138 	cf = ctx->bc->cf_last;
8139 	cf->barrier = 1;
8140 	return 0;
8141 }
8142 
tgsi_load_rat(struct r600_shader_ctx * ctx)8143 static int tgsi_load_rat(struct r600_shader_ctx *ctx)
8144 {
8145 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8146 	/* have to work out the offset into the RAT immediate return buffer */
8147 	struct r600_bytecode_vtx vtx;
8148 	struct r600_bytecode_cf *cf;
8149 	int r;
8150 	int idx_gpr;
8151 	unsigned format, num_format, format_comp, endian;
8152 	const struct util_format_description *desc;
8153 	unsigned rat_index_mode;
8154 	unsigned immed_base;
8155 
8156 	r = load_thread_id_gpr(ctx);
8157 	if (r)
8158 		return r;
8159 
8160 	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8161 
8162 	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
8163 	r = load_index_src(ctx, 1, &idx_gpr);
8164 	if (r)
8165 		return r;
8166 
8167 	if (rat_index_mode)
8168 		egcm_load_index_reg(ctx->bc, 1, false);
8169 
8170 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8171 	cf = ctx->bc->cf_last;
8172 
8173 	cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
8174 	cf->rat.inst = V_RAT_INST_NOP_RTN;
8175 	cf->rat.index_mode = rat_index_mode;
8176 	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
8177 	cf->output.gpr = ctx->thread_id_gpr;
8178 	cf->output.index_gpr = idx_gpr;
8179 	cf->output.comp_mask = 0xf;
8180 	cf->output.burst_count = 1;
8181 	cf->vpm = 1;
8182 	cf->barrier = 1;
8183 	cf->mark = 1;
8184 	cf->output.elem_size = 0;
8185 
8186 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
8187 	cf = ctx->bc->cf_last;
8188 	cf->barrier = 1;
8189 
8190 	desc = util_format_description(inst->Memory.Format);
8191 	r600_vertex_data_type(inst->Memory.Format,
8192 			      &format, &num_format, &format_comp, &endian);
8193 	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8194 	vtx.op = FETCH_OP_VFETCH;
8195 	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
8196 	vtx.buffer_index_mode = rat_index_mode;
8197 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8198 	vtx.src_gpr = ctx->thread_id_gpr;
8199 	vtx.src_sel_x = 1;
8200 	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8201 	vtx.dst_sel_x = desc->swizzle[0];
8202 	vtx.dst_sel_y = desc->swizzle[1];
8203 	vtx.dst_sel_z = desc->swizzle[2];
8204 	vtx.dst_sel_w = desc->swizzle[3];
8205 	vtx.srf_mode_all = 1;
8206 	vtx.data_format = format;
8207 	vtx.num_format_all = num_format;
8208 	vtx.format_comp_all = format_comp;
8209 	vtx.endian = endian;
8210 	vtx.offset = 0;
8211 	vtx.mega_fetch_count = 3;
8212 	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8213 	if (r)
8214 		return r;
8215 	cf = ctx->bc->cf_last;
8216 	cf->barrier = 1;
8217 	return 0;
8218 }
8219 
tgsi_load_lds(struct r600_shader_ctx * ctx)8220 static int tgsi_load_lds(struct r600_shader_ctx *ctx)
8221 {
8222 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8223 	struct r600_bytecode_alu alu;
8224 	int r;
8225 	int temp_reg = r600_get_temp(ctx);
8226 
8227 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8228 	alu.op = ALU_OP1_MOV;
8229 	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8230 	alu.dst.sel = temp_reg;
8231 	alu.dst.write = 1;
8232 	alu.last = 1;
8233 	r = r600_bytecode_add_alu(ctx->bc, &alu);
8234 	if (r)
8235 		return r;
8236 
8237 	r = do_lds_fetch_values(ctx, temp_reg,
8238 				ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
8239 	if (r)
8240 		return r;
8241 	return 0;
8242 }
8243 
tgsi_load(struct r600_shader_ctx * ctx)8244 static int tgsi_load(struct r600_shader_ctx *ctx)
8245 {
8246 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8247 	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8248 		return tgsi_load_rat(ctx);
8249 	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8250 		return tgsi_load_gds(ctx);
8251 	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
8252 		return tgsi_load_buffer(ctx);
8253 	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
8254 		return tgsi_load_lds(ctx);
8255 	return 0;
8256 }
8257 
tgsi_store_buffer_rat(struct r600_shader_ctx * ctx)8258 static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
8259 {
8260 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8261 	struct r600_bytecode_cf *cf;
8262 	int r, i;
8263 	unsigned rat_index_mode;
8264 	int lasti;
8265 	int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);
8266 
8267 	r = load_buffer_coord(ctx, 0, treg2);
8268 	if (r)
8269 		return r;
8270 
8271 	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8272 	if (rat_index_mode)
8273 		egcm_load_index_reg(ctx->bc, 1, false);
8274 
8275 	for (i = 0; i <= 3; i++) {
8276 		struct r600_bytecode_alu alu;
8277 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8278 		alu.op = ALU_OP1_MOV;
8279 		alu.dst.sel = temp_reg;
8280 		alu.dst.chan = i;
8281 		alu.src[0].sel = V_SQ_ALU_SRC_0;
8282 		alu.last = (i == 3);
8283 		alu.dst.write = 1;
8284 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8285 		if (r)
8286 			return r;
8287 	}
8288 
8289 	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8290 	for (i = 0; i <= lasti; i++) {
8291 		struct r600_bytecode_alu alu;
8292 		if (!((1 << i) & inst->Dst[0].Register.WriteMask))
8293 			continue;
8294 
8295 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
8296 				   temp_reg, 0,
8297 				   treg2, 0,
8298 				   V_SQ_ALU_SRC_LITERAL, i);
8299 		if (r)
8300 			return r;
8301 
8302 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8303 		alu.op = ALU_OP1_MOV;
8304 		alu.dst.sel = ctx->temp_reg;
8305 		alu.dst.chan = 0;
8306 
8307 		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
8308 		alu.last = 1;
8309 		alu.dst.write = 1;
8310 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8311 		if (r)
8312 			return r;
8313 
8314 		r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8315 		cf = ctx->bc->cf_last;
8316 
8317 		cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
8318 		cf->rat.inst = V_RAT_INST_STORE_TYPED;
8319 		cf->rat.index_mode = rat_index_mode;
8320 		cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
8321 		cf->output.gpr = ctx->temp_reg;
8322 		cf->output.index_gpr = temp_reg;
8323 		cf->output.comp_mask = 1;
8324 		cf->output.burst_count = 1;
8325 		cf->vpm = 1;
8326 		cf->barrier = 1;
8327 		cf->output.elem_size = 0;
8328 	}
8329 	return 0;
8330 }
8331 
tgsi_store_rat(struct r600_shader_ctx * ctx)8332 static int tgsi_store_rat(struct r600_shader_ctx *ctx)
8333 {
8334 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8335 	struct r600_bytecode_cf *cf;
8336 	bool src_requires_loading = false;
8337 	int val_gpr, idx_gpr;
8338 	int r, i;
8339 	unsigned rat_index_mode;
8340 
8341 	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8342 
8343 	r = load_index_src(ctx, 0, &idx_gpr);
8344 	if (r)
8345 		return r;
8346 
8347 	if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
8348 		src_requires_loading = true;
8349 
8350 	if (src_requires_loading) {
8351 		struct r600_bytecode_alu alu;
8352 		for (i = 0; i < 4; i++) {
8353 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8354 			alu.op = ALU_OP1_MOV;
8355 			alu.dst.sel = ctx->temp_reg;
8356 			alu.dst.chan = i;
8357 
8358 			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
8359 			if (i == 3)
8360 				alu.last = 1;
8361 			alu.dst.write = 1;
8362 			r = r600_bytecode_add_alu(ctx->bc, &alu);
8363 			if (r)
8364 				return r;
8365 		}
8366 		val_gpr = ctx->temp_reg;
8367 	} else
8368 		val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
8369 	if (rat_index_mode)
8370 		egcm_load_index_reg(ctx->bc, 1, false);
8371 
8372 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8373 	cf = ctx->bc->cf_last;
8374 
8375 	cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
8376 	cf->rat.inst = V_RAT_INST_STORE_TYPED;
8377 	cf->rat.index_mode = rat_index_mode;
8378 	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
8379 	cf->output.gpr = val_gpr;
8380 	cf->output.index_gpr = idx_gpr;
8381 	cf->output.comp_mask = 0xf;
8382 	cf->output.burst_count = 1;
8383 	cf->vpm = 1;
8384 	cf->barrier = 1;
8385 	cf->output.elem_size = 0;
8386 	return 0;
8387 }
8388 
tgsi_store_lds(struct r600_shader_ctx * ctx)8389 static int tgsi_store_lds(struct r600_shader_ctx *ctx)
8390 {
8391 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8392 	struct r600_bytecode_alu alu;
8393 	int r, i, lasti;
8394 	int write_mask = inst->Dst[0].Register.WriteMask;
8395 	int temp_reg = r600_get_temp(ctx);
8396 
8397 	/* LDS write */
8398 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8399 	alu.op = ALU_OP1_MOV;
8400 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8401 	alu.dst.sel = temp_reg;
8402 	alu.dst.write = 1;
8403 	alu.last = 1;
8404 	r = r600_bytecode_add_alu(ctx->bc, &alu);
8405 	if (r)
8406 		return r;
8407 
8408 	lasti = tgsi_last_instruction(write_mask);
8409 	for (i = 1; i <= lasti; i++) {
8410 		if (!(write_mask & (1 << i)))
8411 			continue;
8412 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
8413 				   temp_reg, i,
8414 				   temp_reg, 0,
8415 				   V_SQ_ALU_SRC_LITERAL, 4 * i);
8416 		if (r)
8417 			return r;
8418 	}
8419 	for (i = 0; i <= lasti; i++) {
8420 		if (!(write_mask & (1 << i)))
8421 			continue;
8422 
8423 		if ((i == 0 && ((write_mask & 3) == 3)) ||
8424 		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
8425 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8426 			alu.op = LDS_OP3_LDS_WRITE_REL;
8427 
8428 			alu.src[0].sel = temp_reg;
8429 			alu.src[0].chan = i;
8430 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
8431 			r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
8432 			alu.last = 1;
8433 			alu.is_lds_idx_op = true;
8434 			alu.lds_idx = 1;
8435 			r = r600_bytecode_add_alu(ctx->bc, &alu);
8436 			if (r)
8437 				return r;
8438 			i += 1;
8439 			continue;
8440 		}
8441 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8442 		alu.op = LDS_OP2_LDS_WRITE;
8443 
8444 		alu.src[0].sel = temp_reg;
8445 		alu.src[0].chan = i;
8446 		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
8447 
8448 		alu.last = 1;
8449 		alu.is_lds_idx_op = true;
8450 
8451 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8452 		if (r)
8453 			return r;
8454 	}
8455 	return 0;
8456 }
8457 
tgsi_store(struct r600_shader_ctx * ctx)8458 static int tgsi_store(struct r600_shader_ctx *ctx)
8459 {
8460 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8461 	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
8462 		return tgsi_store_buffer_rat(ctx);
8463 	else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
8464 		return tgsi_store_lds(ctx);
8465 	else
8466 		return tgsi_store_rat(ctx);
8467 }
8468 
tgsi_atomic_op_rat(struct r600_shader_ctx * ctx)8469 static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
8470 {
8471 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8472 	/* have to work out the offset into the RAT immediate return buffer */
8473 	struct r600_bytecode_alu alu;
8474 	struct r600_bytecode_vtx vtx;
8475 	struct r600_bytecode_cf *cf;
8476 	int r;
8477 	int idx_gpr;
8478 	unsigned format, num_format, format_comp, endian;
8479 	const struct util_format_description *desc;
8480 	unsigned rat_index_mode;
8481 	unsigned immed_base;
8482 	unsigned rat_base;
8483 
8484 	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
8485 	rat_base = ctx->shader->rat_base;
8486 
8487 	r = load_thread_id_gpr(ctx);
8488 	if (r)
8489 		return r;
8490 
8491         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
8492 		immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
8493 		rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];
8494 
8495 		r = load_buffer_coord(ctx, 1, ctx->temp_reg);
8496 		if (r)
8497 			return r;
8498 		idx_gpr = ctx->temp_reg;
8499 	} else {
8500 		r = load_index_src(ctx, 1, &idx_gpr);
8501 		if (r)
8502 			return r;
8503 	}
8504 
8505 	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8506 
8507 	if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
8508 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8509 		alu.op = ALU_OP1_MOV;
8510 		alu.dst.sel = ctx->thread_id_gpr;
8511 		alu.dst.chan = 0;
8512 		alu.dst.write = 1;
8513 		r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
8514 		alu.last = 1;
8515 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8516 		if (r)
8517 			return r;
8518 
8519 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8520 		alu.op = ALU_OP1_MOV;
8521 		alu.dst.sel = ctx->thread_id_gpr;
8522 		if (ctx->bc->chip_class == CAYMAN)
8523 			alu.dst.chan = 2;
8524 		else
8525 			alu.dst.chan = 3;
8526 		alu.dst.write = 1;
8527 		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
8528 		alu.last = 1;
8529 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8530 		if (r)
8531 			return r;
8532 	} else {
8533 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8534 		alu.op = ALU_OP1_MOV;
8535 		alu.dst.sel = ctx->thread_id_gpr;
8536 		alu.dst.chan = 0;
8537 		alu.dst.write = 1;
8538 		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
8539 		alu.last = 1;
8540 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8541 		if (r)
8542 			return r;
8543 	}
8544 
8545 	if (rat_index_mode)
8546 		egcm_load_index_reg(ctx->bc, 1, false);
8547 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8548 	cf = ctx->bc->cf_last;
8549 
8550 	cf->rat.id = rat_base + inst->Src[0].Register.Index;
8551 	cf->rat.inst = ctx->inst_info->op;
8552 	cf->rat.index_mode = rat_index_mode;
8553 	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
8554 	cf->output.gpr = ctx->thread_id_gpr;
8555 	cf->output.index_gpr = idx_gpr;
8556 	cf->output.comp_mask = 0xf;
8557 	cf->output.burst_count = 1;
8558 	cf->vpm = 1;
8559 	cf->barrier = 1;
8560 	cf->mark = 1;
8561 	cf->output.elem_size = 0;
8562 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
8563 	cf = ctx->bc->cf_last;
8564 	cf->barrier = 1;
8565 	cf->cf_addr = 1;
8566 
8567 	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8568 	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
8569 		desc = util_format_description(inst->Memory.Format);
8570 		r600_vertex_data_type(inst->Memory.Format,
8571 				      &format, &num_format, &format_comp, &endian);
8572 		vtx.dst_sel_x = desc->swizzle[0];
8573 	} else {
8574 		format = FMT_32;
8575 		num_format = 1;
8576 		format_comp = 0;
8577 		endian = 0;
8578 		vtx.dst_sel_x = 0;
8579 	}
8580 	vtx.op = FETCH_OP_VFETCH;
8581 	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
8582 	vtx.buffer_index_mode = rat_index_mode;
8583 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8584 	vtx.src_gpr = ctx->thread_id_gpr;
8585 	vtx.src_sel_x = 1;
8586 	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8587 	vtx.dst_sel_y = 7;
8588 	vtx.dst_sel_z = 7;
8589 	vtx.dst_sel_w = 7;
8590 	vtx.use_const_fields = 0;
8591 	vtx.srf_mode_all = 1;
8592 	vtx.data_format = format;
8593 	vtx.num_format_all = num_format;
8594 	vtx.format_comp_all = format_comp;
8595 	vtx.endian = endian;
8596 	vtx.offset = 0;
8597 	vtx.mega_fetch_count = 0xf;
8598 	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8599 	if (r)
8600 		return r;
8601 	cf = ctx->bc->cf_last;
8602 	cf->vpm = 1;
8603 	cf->barrier = 1;
8604 	return 0;
8605 }
8606 
get_gds_op(int opcode)8607 static int get_gds_op(int opcode)
8608 {
8609 	switch (opcode) {
8610 	case TGSI_OPCODE_ATOMUADD:
8611 		return FETCH_OP_GDS_ADD_RET;
8612 	case TGSI_OPCODE_ATOMAND:
8613 		return FETCH_OP_GDS_AND_RET;
8614 	case TGSI_OPCODE_ATOMOR:
8615 		return FETCH_OP_GDS_OR_RET;
8616 	case TGSI_OPCODE_ATOMXOR:
8617 		return FETCH_OP_GDS_XOR_RET;
8618 	case TGSI_OPCODE_ATOMUMIN:
8619 		return FETCH_OP_GDS_MIN_UINT_RET;
8620 	case TGSI_OPCODE_ATOMUMAX:
8621 		return FETCH_OP_GDS_MAX_UINT_RET;
8622 	case TGSI_OPCODE_ATOMXCHG:
8623 		return FETCH_OP_GDS_XCHG_RET;
8624 	case TGSI_OPCODE_ATOMCAS:
8625 		return FETCH_OP_GDS_CMP_XCHG_RET;
8626 	default:
8627 		return -1;
8628 	}
8629 }
8630 
tgsi_atomic_op_gds(struct r600_shader_ctx * ctx)8631 static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
8632 {
8633 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8634 	struct r600_bytecode_gds gds;
8635 	struct r600_bytecode_alu alu;
8636 	int gds_op = get_gds_op(inst->Instruction.Opcode);
8637 	int r;
8638 	int uav_id = 0;
8639 	int uav_index_mode = 0;
8640 	bool is_cm = (ctx->bc->chip_class == CAYMAN);
8641 
8642 	if (gds_op == -1) {
8643 		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
8644 		return -1;
8645 	}
8646 
8647 	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
8648 	if (r)
8649 		return r;
8650 
8651 	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
8652 		int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
8653 		int abs_value = abs(value);
8654 		if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
8655 			gds_op = FETCH_OP_GDS_SUB_RET;
8656 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8657 		alu.op = ALU_OP1_MOV;
8658 		alu.dst.sel = ctx->temp_reg;
8659 		alu.dst.chan = is_cm ? 1 : 0;
8660 		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8661 		alu.src[0].value = abs_value;
8662 		alu.last = 1;
8663 		alu.dst.write = 1;
8664 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8665 		if (r)
8666 			return r;
8667 	} else {
8668 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8669 		alu.op = ALU_OP1_MOV;
8670 		alu.dst.sel = ctx->temp_reg;
8671 		alu.dst.chan = is_cm ? 1 : 0;
8672 		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
8673 		alu.last = 1;
8674 		alu.dst.write = 1;
8675 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8676 		if (r)
8677 			return r;
8678 	}
8679 
8680 
8681 	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
8682 	gds.op = gds_op;
8683 	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8684 	gds.uav_id = is_cm ? 0 : uav_id;
8685 	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
8686 	gds.src_gpr = ctx->temp_reg;
8687 	gds.src_gpr2 = 0;
8688 	gds.src_sel_x = is_cm ? 0 : 4;
8689 	gds.src_sel_y = is_cm ? 1 : 0;
8690 	gds.src_sel_z = 7;
8691 	gds.dst_sel_x = 0;
8692 	gds.dst_sel_y = 7;
8693 	gds.dst_sel_z = 7;
8694 	gds.dst_sel_w = 7;
8695 	gds.alloc_consume = !is_cm;
8696 
8697 	r = r600_bytecode_add_gds(ctx->bc, &gds);
8698 	if (r)
8699 		return r;
8700 	ctx->bc->cf_last->vpm = 1;
8701 	return 0;
8702 }
8703 
get_lds_op(int opcode)8704 static int get_lds_op(int opcode)
8705 {
8706 	switch (opcode) {
8707 	case TGSI_OPCODE_ATOMUADD:
8708 		return LDS_OP2_LDS_ADD_RET;
8709 	case TGSI_OPCODE_ATOMAND:
8710 		return LDS_OP2_LDS_AND_RET;
8711 	case TGSI_OPCODE_ATOMOR:
8712 		return LDS_OP2_LDS_OR_RET;
8713 	case TGSI_OPCODE_ATOMXOR:
8714 		return LDS_OP2_LDS_XOR_RET;
8715 	case TGSI_OPCODE_ATOMUMIN:
8716 		return LDS_OP2_LDS_MIN_UINT_RET;
8717 	case TGSI_OPCODE_ATOMUMAX:
8718 		return LDS_OP2_LDS_MAX_UINT_RET;
8719 	case TGSI_OPCODE_ATOMIMIN:
8720 		return LDS_OP2_LDS_MIN_INT_RET;
8721 	case TGSI_OPCODE_ATOMIMAX:
8722 		return LDS_OP2_LDS_MAX_INT_RET;
8723 	case TGSI_OPCODE_ATOMXCHG:
8724 		return LDS_OP2_LDS_XCHG_RET;
8725 	case TGSI_OPCODE_ATOMCAS:
8726 		return LDS_OP3_LDS_CMP_XCHG_RET;
8727 	default:
8728 		return -1;
8729 	}
8730 }
8731 
tgsi_atomic_op_lds(struct r600_shader_ctx * ctx)8732 static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
8733 {
8734 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8735 	int lds_op = get_lds_op(inst->Instruction.Opcode);
8736 	int r;
8737 
8738 	struct r600_bytecode_alu alu;
8739 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8740 	alu.op = lds_op;
8741 	alu.is_lds_idx_op = true;
8742 	alu.last = 1;
8743 	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8744 	r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);
8745 	if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
8746 		r600_bytecode_src(&alu.src[2], &ctx->src[3], 0);
8747 	else
8748 		alu.src[2].sel = V_SQ_ALU_SRC_0;
8749 	r = r600_bytecode_add_alu(ctx->bc, &alu);
8750 	if (r)
8751 		return r;
8752 
8753 	/* then read from LDS_OQ_A_POP */
8754 	memset(&alu, 0, sizeof(alu));
8755 
8756 	alu.op = ALU_OP1_MOV;
8757 	alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
8758 	alu.src[0].chan = 0;
8759 	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
8760 	alu.dst.write = 1;
8761 	alu.last = 1;
8762 	r = r600_bytecode_add_alu(ctx->bc, &alu);
8763 	if (r)
8764 		return r;
8765 
8766 	return 0;
8767 }
8768 
tgsi_atomic_op(struct r600_shader_ctx * ctx)8769 static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
8770 {
8771 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8772 	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8773 		return tgsi_atomic_op_rat(ctx);
8774 	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8775 		return tgsi_atomic_op_gds(ctx);
8776 	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
8777 		return tgsi_atomic_op_rat(ctx);
8778 	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
8779 		return tgsi_atomic_op_lds(ctx);
8780 	return 0;
8781 }
8782 
tgsi_resq(struct r600_shader_ctx * ctx)8783 static int tgsi_resq(struct r600_shader_ctx *ctx)
8784 {
8785 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8786 	unsigned sampler_index_mode;
8787 	struct r600_bytecode_tex tex;
8788 	int r;
8789 	boolean has_txq_cube_array_z = false;
8790 
8791 	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
8792 	    (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
8793 		if (ctx->bc->chip_class < EVERGREEN)
8794 			ctx->shader->uses_tex_buffers = true;
8795 		return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset);
8796 	}
8797 
8798 	if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
8799 	    inst->Dst[0].Register.WriteMask & 4) {
8800 		ctx->shader->has_txq_cube_array_z_comp = true;
8801 		has_txq_cube_array_z = true;
8802 	}
8803 
8804 	sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8805 	if (sampler_index_mode)
8806 		egcm_load_index_reg(ctx->bc, 1, false);
8807 
8808 
8809 	/* does this shader want a num layers from TXQ for a cube array? */
8810 	if (has_txq_cube_array_z) {
8811 		int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
8812 		struct r600_bytecode_alu alu;
8813 
8814 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8815 		alu.op = ALU_OP1_MOV;
8816 
8817 		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
8818 		/* with eg each dword is either number of cubes */
8819 		alu.src[0].sel += id / 4;
8820 		alu.src[0].chan = id % 4;
8821 		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
8822 		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
8823 		alu.last = 1;
8824 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8825 		if (r)
8826 			return r;
8827 		/* disable writemask from texture instruction */
8828 		inst->Dst[0].Register.WriteMask &= ~4;
8829 	}
8830 	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8831 	tex.op = ctx->inst_info->op;
8832 	tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
8833 	tex.sampler_index_mode = sampler_index_mode;
8834 	tex.resource_id = tex.sampler_id;
8835 	tex.resource_index_mode = sampler_index_mode;
8836 	tex.src_sel_x = 4;
8837 	tex.src_sel_y = 4;
8838 	tex.src_sel_z = 4;
8839 	tex.src_sel_w = 4;
8840 	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8841 	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8842 	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8843 	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8844 	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8845 	r = r600_bytecode_add_tex(ctx->bc, &tex);
8846 	if (r)
8847 		return r;
8848 
8849 	return 0;
8850 }
8851 
tgsi_lrp(struct r600_shader_ctx * ctx)8852 static int tgsi_lrp(struct r600_shader_ctx *ctx)
8853 {
8854 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8855 	struct r600_bytecode_alu alu;
8856 	unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8857 	unsigned i, temp_regs[2];
8858 	int r;
8859 
8860 	/* optimize if it's just an equal balance */
8861 	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
8862 		for (i = 0; i < lasti + 1; i++) {
8863 			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8864 				continue;
8865 
8866 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8867 			alu.op = ALU_OP2_ADD;
8868 			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
8869 			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
8870 			alu.omod = 3;
8871 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8872 			alu.dst.chan = i;
8873 			if (i == lasti) {
8874 				alu.last = 1;
8875 			}
8876 			r = r600_bytecode_add_alu(ctx->bc, &alu);
8877 			if (r)
8878 				return r;
8879 		}
8880 		return 0;
8881 	}
8882 
8883 	/* 1 - src0 */
8884 	for (i = 0; i < lasti + 1; i++) {
8885 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8886 			continue;
8887 
8888 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8889 		alu.op = ALU_OP2_ADD;
8890 		alu.src[0].sel = V_SQ_ALU_SRC_1;
8891 		alu.src[0].chan = 0;
8892 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
8893 		r600_bytecode_src_toggle_neg(&alu.src[1]);
8894 		alu.dst.sel = ctx->temp_reg;
8895 		alu.dst.chan = i;
8896 		if (i == lasti) {
8897 			alu.last = 1;
8898 		}
8899 		alu.dst.write = 1;
8900 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8901 		if (r)
8902 			return r;
8903 	}
8904 
8905 	/* (1 - src0) * src2 */
8906 	for (i = 0; i < lasti + 1; i++) {
8907 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8908 			continue;
8909 
8910 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8911 		alu.op = ALU_OP2_MUL;
8912 		alu.src[0].sel = ctx->temp_reg;
8913 		alu.src[0].chan = i;
8914 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
8915 		alu.dst.sel = ctx->temp_reg;
8916 		alu.dst.chan = i;
8917 		if (i == lasti) {
8918 			alu.last = 1;
8919 		}
8920 		alu.dst.write = 1;
8921 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8922 		if (r)
8923 			return r;
8924 	}
8925 
8926 	/* src0 * src1 + (1 - src0) * src2 */
8927         if (ctx->src[0].abs)
8928 		temp_regs[0] = r600_get_temp(ctx);
8929 	else
8930 		temp_regs[0] = 0;
8931 	if (ctx->src[1].abs)
8932 		temp_regs[1] = r600_get_temp(ctx);
8933 	else
8934 		temp_regs[1] = 0;
8935 
8936 	for (i = 0; i < lasti + 1; i++) {
8937 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8938 			continue;
8939 
8940 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8941 		alu.op = ALU_OP3_MULADD;
8942 		alu.is_op3 = 1;
8943 		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
8944 		if (r)
8945 			return r;
8946 		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
8947 		if (r)
8948 			return r;
8949 		alu.src[2].sel = ctx->temp_reg;
8950 		alu.src[2].chan = i;
8951 
8952 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8953 		alu.dst.chan = i;
8954 		if (i == lasti) {
8955 			alu.last = 1;
8956 		}
8957 		r = r600_bytecode_add_alu(ctx->bc, &alu);
8958 		if (r)
8959 			return r;
8960 	}
8961 	return 0;
8962 }
8963 
tgsi_cmp(struct r600_shader_ctx * ctx)8964 static int tgsi_cmp(struct r600_shader_ctx *ctx)
8965 {
8966 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8967 	struct r600_bytecode_alu alu;
8968 	int i, r, j;
8969 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8970 	int temp_regs[3];
8971 	unsigned op;
8972 
8973 	if (ctx->src[0].abs && ctx->src[0].neg) {
8974 		op = ALU_OP3_CNDE;
8975 		ctx->src[0].abs = 0;
8976 		ctx->src[0].neg = 0;
8977 	} else {
8978 		op = ALU_OP3_CNDGE;
8979 	}
8980 
8981 	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
8982 		temp_regs[j] = 0;
8983 		if (ctx->src[j].abs)
8984 			temp_regs[j] = r600_get_temp(ctx);
8985 	}
8986 
8987 	for (i = 0; i < lasti + 1; i++) {
8988 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8989 			continue;
8990 
8991 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8992 		alu.op = op;
8993 		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
8994 		if (r)
8995 			return r;
8996 		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
8997 		if (r)
8998 			return r;
8999 		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
9000 		if (r)
9001 			return r;
9002 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9003 		alu.dst.chan = i;
9004 		alu.dst.write = 1;
9005 		alu.is_op3 = 1;
9006 		if (i == lasti)
9007 			alu.last = 1;
9008 		r = r600_bytecode_add_alu(ctx->bc, &alu);
9009 		if (r)
9010 			return r;
9011 	}
9012 	return 0;
9013 }
9014 
tgsi_ucmp(struct r600_shader_ctx * ctx)9015 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
9016 {
9017 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9018 	struct r600_bytecode_alu alu;
9019 	int i, r;
9020 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9021 
9022 	for (i = 0; i < lasti + 1; i++) {
9023 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9024 			continue;
9025 
9026 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9027 		alu.op = ALU_OP3_CNDE_INT;
9028 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9029 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9030 		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
9031 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9032 		alu.dst.chan = i;
9033 		alu.dst.write = 1;
9034 		alu.is_op3 = 1;
9035 		if (i == lasti)
9036 			alu.last = 1;
9037 		r = r600_bytecode_add_alu(ctx->bc, &alu);
9038 		if (r)
9039 			return r;
9040 	}
9041 	return 0;
9042 }
9043 
tgsi_exp(struct r600_shader_ctx * ctx)9044 static int tgsi_exp(struct r600_shader_ctx *ctx)
9045 {
9046 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9047 	struct r600_bytecode_alu alu;
9048 	int r;
9049 	unsigned i;
9050 
9051 	/* result.x = 2^floor(src); */
9052 	if (inst->Dst[0].Register.WriteMask & 1) {
9053 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9054 
9055 		alu.op = ALU_OP1_FLOOR;
9056 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9057 
9058 		alu.dst.sel = ctx->temp_reg;
9059 		alu.dst.chan = 0;
9060 		alu.dst.write = 1;
9061 		alu.last = 1;
9062 		r = r600_bytecode_add_alu(ctx->bc, &alu);
9063 		if (r)
9064 			return r;
9065 
9066 		if (ctx->bc->chip_class == CAYMAN) {
9067 			for (i = 0; i < 3; i++) {
9068 				alu.op = ALU_OP1_EXP_IEEE;
9069 				alu.src[0].sel = ctx->temp_reg;
9070 				alu.src[0].chan = 0;
9071 
9072 				alu.dst.sel = ctx->temp_reg;
9073 				alu.dst.chan = i;
9074 				alu.dst.write = i == 0;
9075 				alu.last = i == 2;
9076 				r = r600_bytecode_add_alu(ctx->bc, &alu);
9077 				if (r)
9078 					return r;
9079 			}
9080 		} else {
9081 			alu.op = ALU_OP1_EXP_IEEE;
9082 			alu.src[0].sel = ctx->temp_reg;
9083 			alu.src[0].chan = 0;
9084 
9085 			alu.dst.sel = ctx->temp_reg;
9086 			alu.dst.chan = 0;
9087 			alu.dst.write = 1;
9088 			alu.last = 1;
9089 			r = r600_bytecode_add_alu(ctx->bc, &alu);
9090 			if (r)
9091 				return r;
9092 		}
9093 	}
9094 
9095 	/* result.y = tmp - floor(tmp); */
9096 	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9097 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9098 
9099 		alu.op = ALU_OP1_FRACT;
9100 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9101 
9102 		alu.dst.sel = ctx->temp_reg;
9103 #if 0
9104 		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9105 		if (r)
9106 			return r;
9107 #endif
9108 		alu.dst.write = 1;
9109 		alu.dst.chan = 1;
9110 
9111 		alu.last = 1;
9112 
9113 		r = r600_bytecode_add_alu(ctx->bc, &alu);
9114 		if (r)
9115 			return r;
9116 	}
9117 
9118 	/* result.z = RoughApprox2ToX(tmp);*/
9119 	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
9120 		if (ctx->bc->chip_class == CAYMAN) {
9121 			for (i = 0; i < 3; i++) {
9122 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9123 				alu.op = ALU_OP1_EXP_IEEE;
9124 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9125 
9126 				alu.dst.sel = ctx->temp_reg;
9127 				alu.dst.chan = i;
9128 				if (i == 2) {
9129 					alu.dst.write = 1;
9130 					alu.last = 1;
9131 				}
9132 
9133 				r = r600_bytecode_add_alu(ctx->bc, &alu);
9134 				if (r)
9135 					return r;
9136 			}
9137 		} else {
9138 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9139 			alu.op = ALU_OP1_EXP_IEEE;
9140 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9141 
9142 			alu.dst.sel = ctx->temp_reg;
9143 			alu.dst.write = 1;
9144 			alu.dst.chan = 2;
9145 
9146 			alu.last = 1;
9147 
9148 			r = r600_bytecode_add_alu(ctx->bc, &alu);
9149 			if (r)
9150 				return r;
9151 		}
9152 	}
9153 
9154 	/* result.w = 1.0;*/
9155 	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
9156 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9157 
9158 		alu.op = ALU_OP1_MOV;
9159 		alu.src[0].sel = V_SQ_ALU_SRC_1;
9160 		alu.src[0].chan = 0;
9161 
9162 		alu.dst.sel = ctx->temp_reg;
9163 		alu.dst.chan = 3;
9164 		alu.dst.write = 1;
9165 		alu.last = 1;
9166 		r = r600_bytecode_add_alu(ctx->bc, &alu);
9167 		if (r)
9168 			return r;
9169 	}
9170 	return tgsi_helper_copy(ctx, inst);
9171 }
9172 
tgsi_log(struct r600_shader_ctx * ctx)9173 static int tgsi_log(struct r600_shader_ctx *ctx)
9174 {
9175 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9176 	struct r600_bytecode_alu alu;
9177 	int r;
9178 	unsigned i;
9179 
9180 	/* result.x = floor(log2(|src|)); */
9181 	if (inst->Dst[0].Register.WriteMask & 1) {
9182 		if (ctx->bc->chip_class == CAYMAN) {
9183 			for (i = 0; i < 3; i++) {
9184 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9185 
9186 				alu.op = ALU_OP1_LOG_IEEE;
9187 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9188 				r600_bytecode_src_set_abs(&alu.src[0]);
9189 
9190 				alu.dst.sel = ctx->temp_reg;
9191 				alu.dst.chan = i;
9192 				if (i == 0)
9193 					alu.dst.write = 1;
9194 				if (i == 2)
9195 					alu.last = 1;
9196 				r = r600_bytecode_add_alu(ctx->bc, &alu);
9197 				if (r)
9198 					return r;
9199 			}
9200 
9201 		} else {
9202 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9203 
9204 			alu.op = ALU_OP1_LOG_IEEE;
9205 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9206 			r600_bytecode_src_set_abs(&alu.src[0]);
9207 
9208 			alu.dst.sel = ctx->temp_reg;
9209 			alu.dst.chan = 0;
9210 			alu.dst.write = 1;
9211 			alu.last = 1;
9212 			r = r600_bytecode_add_alu(ctx->bc, &alu);
9213 			if (r)
9214 				return r;
9215 		}
9216 
9217 		alu.op = ALU_OP1_FLOOR;
9218 		alu.src[0].sel = ctx->temp_reg;
9219 		alu.src[0].chan = 0;
9220 
9221 		alu.dst.sel = ctx->temp_reg;
9222 		alu.dst.chan = 0;
9223 		alu.dst.write = 1;
9224 		alu.last = 1;
9225 
9226 		r = r600_bytecode_add_alu(ctx->bc, &alu);
9227 		if (r)
9228 			return r;
9229 	}
9230 
9231 	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
9232 	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9233 
9234 		if (ctx->bc->chip_class == CAYMAN) {
9235 			for (i = 0; i < 3; i++) {
9236 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9237 
9238 				alu.op = ALU_OP1_LOG_IEEE;
9239 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9240 				r600_bytecode_src_set_abs(&alu.src[0]);
9241 
9242 				alu.dst.sel = ctx->temp_reg;
9243 				alu.dst.chan = i;
9244 				if (i == 1)
9245 					alu.dst.write = 1;
9246 				if (i == 2)
9247 					alu.last = 1;
9248 
9249 				r = r600_bytecode_add_alu(ctx->bc, &alu);
9250 				if (r)
9251 					return r;
9252 			}
9253 		} else {
9254 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9255 
9256 			alu.op = ALU_OP1_LOG_IEEE;
9257 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9258 			r600_bytecode_src_set_abs(&alu.src[0]);
9259 
9260 			alu.dst.sel = ctx->temp_reg;
9261 			alu.dst.chan = 1;
9262 			alu.dst.write = 1;
9263 			alu.last = 1;
9264 
9265 			r = r600_bytecode_add_alu(ctx->bc, &alu);
9266 			if (r)
9267 				return r;
9268 		}
9269 
9270 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9271 
9272 		alu.op = ALU_OP1_FLOOR;
9273 		alu.src[0].sel = ctx->temp_reg;
9274 		alu.src[0].chan = 1;
9275 
9276 		alu.dst.sel = ctx->temp_reg;
9277 		alu.dst.chan = 1;
9278 		alu.dst.write = 1;
9279 		alu.last = 1;
9280 
9281 		r = r600_bytecode_add_alu(ctx->bc, &alu);
9282 		if (r)
9283 			return r;
9284 
9285 		if (ctx->bc->chip_class == CAYMAN) {
9286 			for (i = 0; i < 3; i++) {
9287 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9288 				alu.op = ALU_OP1_EXP_IEEE;
9289 				alu.src[0].sel = ctx->temp_reg;
9290 				alu.src[0].chan = 1;
9291 
9292 				alu.dst.sel = ctx->temp_reg;
9293 				alu.dst.chan = i;
9294 				if (i == 1)
9295 					alu.dst.write = 1;
9296 				if (i == 2)
9297 					alu.last = 1;
9298 
9299 				r = r600_bytecode_add_alu(ctx->bc, &alu);
9300 				if (r)
9301 					return r;
9302 			}
9303 		} else {
9304 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9305 			alu.op = ALU_OP1_EXP_IEEE;
9306 			alu.src[0].sel = ctx->temp_reg;
9307 			alu.src[0].chan = 1;
9308 
9309 			alu.dst.sel = ctx->temp_reg;
9310 			alu.dst.chan = 1;
9311 			alu.dst.write = 1;
9312 			alu.last = 1;
9313 
9314 			r = r600_bytecode_add_alu(ctx->bc, &alu);
9315 			if (r)
9316 				return r;
9317 		}
9318 
9319 		if (ctx->bc->chip_class == CAYMAN) {
9320 			for (i = 0; i < 3; i++) {
9321 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9322 				alu.op = ALU_OP1_RECIP_IEEE;
9323 				alu.src[0].sel = ctx->temp_reg;
9324 				alu.src[0].chan = 1;
9325 
9326 				alu.dst.sel = ctx->temp_reg;
9327 				alu.dst.chan = i;
9328 				if (i == 1)
9329 					alu.dst.write = 1;
9330 				if (i == 2)
9331 					alu.last = 1;
9332 
9333 				r = r600_bytecode_add_alu(ctx->bc, &alu);
9334 				if (r)
9335 					return r;
9336 			}
9337 		} else {
9338 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9339 			alu.op = ALU_OP1_RECIP_IEEE;
9340 			alu.src[0].sel = ctx->temp_reg;
9341 			alu.src[0].chan = 1;
9342 
9343 			alu.dst.sel = ctx->temp_reg;
9344 			alu.dst.chan = 1;
9345 			alu.dst.write = 1;
9346 			alu.last = 1;
9347 
9348 			r = r600_bytecode_add_alu(ctx->bc, &alu);
9349 			if (r)
9350 				return r;
9351 		}
9352 
9353 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9354 
9355 		alu.op = ALU_OP2_MUL;
9356 
9357 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9358 		r600_bytecode_src_set_abs(&alu.src[0]);
9359 
9360 		alu.src[1].sel = ctx->temp_reg;
9361 		alu.src[1].chan = 1;
9362 
9363 		alu.dst.sel = ctx->temp_reg;
9364 		alu.dst.chan = 1;
9365 		alu.dst.write = 1;
9366 		alu.last = 1;
9367 
9368 		r = r600_bytecode_add_alu(ctx->bc, &alu);
9369 		if (r)
9370 			return r;
9371 	}
9372 
9373 	/* result.z = log2(|src|);*/
9374 	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
9375 		if (ctx->bc->chip_class == CAYMAN) {
9376 			for (i = 0; i < 3; i++) {
9377 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9378 
9379 				alu.op = ALU_OP1_LOG_IEEE;
9380 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9381 				r600_bytecode_src_set_abs(&alu.src[0]);
9382 
9383 				alu.dst.sel = ctx->temp_reg;
9384 				if (i == 2)
9385 					alu.dst.write = 1;
9386 				alu.dst.chan = i;
9387 				if (i == 2)
9388 					alu.last = 1;
9389 
9390 				r = r600_bytecode_add_alu(ctx->bc, &alu);
9391 				if (r)
9392 					return r;
9393 			}
9394 		} else {
9395 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9396 
9397 			alu.op = ALU_OP1_LOG_IEEE;
9398 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9399 			r600_bytecode_src_set_abs(&alu.src[0]);
9400 
9401 			alu.dst.sel = ctx->temp_reg;
9402 			alu.dst.write = 1;
9403 			alu.dst.chan = 2;
9404 			alu.last = 1;
9405 
9406 			r = r600_bytecode_add_alu(ctx->bc, &alu);
9407 			if (r)
9408 				return r;
9409 		}
9410 	}
9411 
9412 	/* result.w = 1.0; */
9413 	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
9414 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9415 
9416 		alu.op = ALU_OP1_MOV;
9417 		alu.src[0].sel = V_SQ_ALU_SRC_1;
9418 		alu.src[0].chan = 0;
9419 
9420 		alu.dst.sel = ctx->temp_reg;
9421 		alu.dst.chan = 3;
9422 		alu.dst.write = 1;
9423 		alu.last = 1;
9424 
9425 		r = r600_bytecode_add_alu(ctx->bc, &alu);
9426 		if (r)
9427 			return r;
9428 	}
9429 
9430 	return tgsi_helper_copy(ctx, inst);
9431 }
9432 
tgsi_eg_arl(struct r600_shader_ctx * ctx)9433 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
9434 {
9435 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9436 	struct r600_bytecode_alu alu;
9437 	int r;
9438 	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9439 	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
9440 
9441 	assert(inst->Dst[0].Register.Index < 3);
9442 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9443 
9444 	switch (inst->Instruction.Opcode) {
9445 	case TGSI_OPCODE_ARL:
9446 		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
9447 		break;
9448 	case TGSI_OPCODE_ARR:
9449 		alu.op = ALU_OP1_FLT_TO_INT;
9450 		break;
9451 	case TGSI_OPCODE_UARL:
9452 		alu.op = ALU_OP1_MOV;
9453 		break;
9454 	default:
9455 		assert(0);
9456 		return -1;
9457 	}
9458 
9459 	for (i = 0; i <= lasti; ++i) {
9460 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9461 			continue;
9462 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9463 		alu.last = i == lasti;
9464 		alu.dst.sel = reg;
9465 	        alu.dst.chan = i;
9466 		alu.dst.write = 1;
9467 		r = r600_bytecode_add_alu(ctx->bc, &alu);
9468 		if (r)
9469 			return r;
9470 	}
9471 
9472 	if (inst->Dst[0].Register.Index > 0)
9473 		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
9474 	else
9475 		ctx->bc->ar_loaded = 0;
9476 
9477 	return 0;
9478 }
tgsi_r600_arl(struct r600_shader_ctx * ctx)9479 static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
9480 {
9481 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9482 	struct r600_bytecode_alu alu;
9483 	int r;
9484 	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9485 
9486 	switch (inst->Instruction.Opcode) {
9487 	case TGSI_OPCODE_ARL:
9488 		memset(&alu, 0, sizeof(alu));
9489 		alu.op = ALU_OP1_FLOOR;
9490 		alu.dst.sel = ctx->bc->ar_reg;
9491 		alu.dst.write = 1;
9492 		for (i = 0; i <= lasti; ++i) {
9493 			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
9494 				alu.dst.chan = i;
9495 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9496 				alu.last = i == lasti;
9497 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
9498 					return r;
9499 			}
9500 		}
9501 
9502 		memset(&alu, 0, sizeof(alu));
9503 		alu.op = ALU_OP1_FLT_TO_INT;
9504 		alu.src[0].sel = ctx->bc->ar_reg;
9505 		alu.dst.sel = ctx->bc->ar_reg;
9506 		alu.dst.write = 1;
9507 		/* FLT_TO_INT is trans-only on r600/r700 */
9508 		alu.last = TRUE;
9509 		for (i = 0; i <= lasti; ++i) {
9510 			alu.dst.chan = i;
9511 			alu.src[0].chan = i;
9512 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
9513 				return r;
9514 		}
9515 		break;
9516 	case TGSI_OPCODE_ARR:
9517 		memset(&alu, 0, sizeof(alu));
9518 		alu.op = ALU_OP1_FLT_TO_INT;
9519 		alu.dst.sel = ctx->bc->ar_reg;
9520 		alu.dst.write = 1;
9521 		/* FLT_TO_INT is trans-only on r600/r700 */
9522 		alu.last = TRUE;
9523 		for (i = 0; i <= lasti; ++i) {
9524 			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
9525 				alu.dst.chan = i;
9526 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9527 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
9528 					return r;
9529 			}
9530 		}
9531 		break;
9532 	case TGSI_OPCODE_UARL:
9533 		memset(&alu, 0, sizeof(alu));
9534 		alu.op = ALU_OP1_MOV;
9535 		alu.dst.sel = ctx->bc->ar_reg;
9536 		alu.dst.write = 1;
9537 		for (i = 0; i <= lasti; ++i) {
9538 			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
9539 				alu.dst.chan = i;
9540 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9541 				alu.last = i == lasti;
9542 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
9543 					return r;
9544 			}
9545 		}
9546 		break;
9547 	default:
9548 		assert(0);
9549 		return -1;
9550 	}
9551 
9552 	ctx->bc->ar_loaded = 0;
9553 	return 0;
9554 }
9555 
tgsi_opdst(struct r600_shader_ctx * ctx)9556 static int tgsi_opdst(struct r600_shader_ctx *ctx)
9557 {
9558 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9559 	struct r600_bytecode_alu alu;
9560 	int i, r = 0;
9561 
9562 	for (i = 0; i < 4; i++) {
9563 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9564 
9565 		alu.op = ALU_OP2_MUL;
9566 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9567 
9568 		if (i == 0 || i == 3) {
9569 			alu.src[0].sel = V_SQ_ALU_SRC_1;
9570 		} else {
9571 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9572 		}
9573 
9574 		if (i == 0 || i == 2) {
9575 			alu.src[1].sel = V_SQ_ALU_SRC_1;
9576 		} else {
9577 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
9578 		}
9579 		if (i == 3)
9580 			alu.last = 1;
9581 		r = r600_bytecode_add_alu(ctx->bc, &alu);
9582 		if (r)
9583 			return r;
9584 	}
9585 	return 0;
9586 }
9587 
emit_logic_pred(struct r600_shader_ctx * ctx,int opcode,int alu_type)9588 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
9589 {
9590 	struct r600_bytecode_alu alu;
9591 	int r;
9592 
9593 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9594 	alu.op = opcode;
9595 	alu.execute_mask = 1;
9596 	alu.update_pred = 1;
9597 
9598 	alu.dst.sel = ctx->temp_reg;
9599 	alu.dst.write = 1;
9600 	alu.dst.chan = 0;
9601 
9602 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9603 	alu.src[1].sel = V_SQ_ALU_SRC_0;
9604 	alu.src[1].chan = 0;
9605 
9606 	alu.last = 1;
9607 
9608 	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
9609 	if (r)
9610 		return r;
9611 	return 0;
9612 }
9613 
pops(struct r600_shader_ctx * ctx,int pops)9614 static int pops(struct r600_shader_ctx *ctx, int pops)
9615 {
9616 	unsigned force_pop = ctx->bc->force_add_cf;
9617 
9618 	if (!force_pop) {
9619 		int alu_pop = 3;
9620 		if (ctx->bc->cf_last) {
9621 			if (ctx->bc->cf_last->op == CF_OP_ALU)
9622 				alu_pop = 0;
9623 			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
9624 				alu_pop = 1;
9625 		}
9626 		alu_pop += pops;
9627 		if (alu_pop == 1) {
9628 			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
9629 			ctx->bc->force_add_cf = 1;
9630 		} else if (alu_pop == 2) {
9631 			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
9632 			ctx->bc->force_add_cf = 1;
9633 		} else {
9634 			force_pop = 1;
9635 		}
9636 	}
9637 
9638 	if (force_pop) {
9639 		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
9640 		ctx->bc->cf_last->pop_count = pops;
9641 		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
9642 	}
9643 
9644 	return 0;
9645 }
9646 
callstack_update_max_depth(struct r600_shader_ctx * ctx,unsigned reason)9647 static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx,
9648                                               unsigned reason)
9649 {
9650 	struct r600_stack_info *stack = &ctx->bc->stack;
9651 	unsigned elements;
9652 	int entries;
9653 
9654 	unsigned entry_size = stack->entry_size;
9655 
9656 	elements = (stack->loop + stack->push_wqm ) * entry_size;
9657 	elements += stack->push;
9658 
9659 	switch (ctx->bc->chip_class) {
9660 	case R600:
9661 	case R700:
9662 		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
9663 		 * the stack must be reserved to hold the current active/continue
9664 		 * masks */
9665 		if (reason == FC_PUSH_VPM || stack->push > 0) {
9666 			elements += 2;
9667 		}
9668 		break;
9669 
9670 	case CAYMAN:
9671 		/* r9xx: any stack operation on empty stack consumes 2 additional
9672 		 * elements */
9673 		elements += 2;
9674 
9675 		/* fallthrough */
9676 		/* FIXME: do the two elements added above cover the cases for the
9677 		 * r8xx+ below? */
9678 
9679 	case EVERGREEN:
9680 		/* r8xx+: 2 extra elements are not always required, but one extra
9681 		 * element must be added for each of the following cases:
9682 		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
9683 		 *    stack usage.
9684 		 *    (Currently we don't use ALU_ELSE_AFTER.)
9685 		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
9686 		 *    PUSH instruction executed.
9687 		 *
9688 		 *    NOTE: it seems we also need to reserve additional element in some
9689 		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
9690 		 *    then STACK_SIZE should be 2 instead of 1 */
9691 		if (reason == FC_PUSH_VPM || stack->push > 0) {
9692 			elements += 1;
9693 		}
9694 		break;
9695 
9696 	default:
9697 		assert(0);
9698 		break;
9699 	}
9700 
9701 	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
9702 	 * for all chips, so we use 4 in the final formula, not the real entry_size
9703 	 * for the chip */
9704 	entry_size = 4;
9705 
9706 	entries = (elements + (entry_size - 1)) / entry_size;
9707 
9708 	if (entries > stack->max_entries)
9709 		stack->max_entries = entries;
9710 	return elements;
9711 }
9712 
callstack_pop(struct r600_shader_ctx * ctx,unsigned reason)9713 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
9714 {
9715 	switch(reason) {
9716 	case FC_PUSH_VPM:
9717 		--ctx->bc->stack.push;
9718 		assert(ctx->bc->stack.push >= 0);
9719 		break;
9720 	case FC_PUSH_WQM:
9721 		--ctx->bc->stack.push_wqm;
9722 		assert(ctx->bc->stack.push_wqm >= 0);
9723 		break;
9724 	case FC_LOOP:
9725 		--ctx->bc->stack.loop;
9726 		assert(ctx->bc->stack.loop >= 0);
9727 		break;
9728 	default:
9729 		assert(0);
9730 		break;
9731 	}
9732 }
9733 
callstack_push(struct r600_shader_ctx * ctx,unsigned reason)9734 static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
9735 {
9736 	switch (reason) {
9737 	case FC_PUSH_VPM:
9738 		++ctx->bc->stack.push;
9739 		break;
9740 	case FC_PUSH_WQM:
9741 		++ctx->bc->stack.push_wqm;
9742 		break;
9743 	case FC_LOOP:
9744 		++ctx->bc->stack.loop;
9745 		break;
9746 	default:
9747 		assert(0);
9748 	}
9749 
9750 	return callstack_update_max_depth(ctx, reason);
9751 }
9752 
fc_set_mid(struct r600_shader_ctx * ctx,int fc_sp)9753 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
9754 {
9755 	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
9756 
9757 	sp->mid = realloc((void *)sp->mid,
9758 						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
9759 	sp->mid[sp->num_mid] = ctx->bc->cf_last;
9760 	sp->num_mid++;
9761 }
9762 
fc_pushlevel(struct r600_shader_ctx * ctx,int type)9763 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
9764 {
9765 	assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
9766 	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
9767 	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
9768 	ctx->bc->fc_sp++;
9769 }
9770 
fc_poplevel(struct r600_shader_ctx * ctx)9771 static void fc_poplevel(struct r600_shader_ctx *ctx)
9772 {
9773 	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
9774 	free(sp->mid);
9775 	sp->mid = NULL;
9776 	sp->num_mid = 0;
9777 	sp->start = NULL;
9778 	sp->type = 0;
9779 	ctx->bc->fc_sp--;
9780 }
9781 
/* Disabled code: everything between the #if 0 / #endif pair is skipped by
 * the preprocessor and is not part of the build.
 *
 * It is an unfinished sketch of helpers for RETURN / flag-driven loop exit
 * handling (emit_return, emit_jump_to_offset, emit_return_on_flag,
 * break_loop_on_flag); several helpers are empty stubs and the jump offset
 * is never computed (see the XXX note).
 *
 * It cannot be enabled as written: the r600_bytecode_add_cfinst() calls in
 * emit_return() and emit_jump_to_offset() carry an extra closing
 * parenthesis. */
#if 0
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
9829 
emit_if(struct r600_shader_ctx * ctx,int opcode)9830 static int emit_if(struct r600_shader_ctx *ctx, int opcode)
9831 {
9832 	int alu_type = CF_OP_ALU_PUSH_BEFORE;
9833 	bool needs_workaround = false;
9834 	int elems = callstack_push(ctx, FC_PUSH_VPM);
9835 
9836 	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1)
9837 		needs_workaround = true;
9838 
9839 	if (ctx->bc->chip_class == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) {
9840 		unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size;
9841 		unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;
9842 
9843 		if (elems && (!dmod1 || !dmod2))
9844 			needs_workaround = true;
9845 	}
9846 
9847 	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
9848 	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
9849 	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
9850 	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
9851 	if (needs_workaround) {
9852 		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
9853 		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
9854 		alu_type = CF_OP_ALU;
9855 	}
9856 
9857 	emit_logic_pred(ctx, opcode, alu_type);
9858 
9859 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
9860 
9861 	fc_pushlevel(ctx, FC_IF);
9862 
9863 	return 0;
9864 }
9865 
tgsi_if(struct r600_shader_ctx * ctx)9866 static int tgsi_if(struct r600_shader_ctx *ctx)
9867 {
9868 	return emit_if(ctx, ALU_OP2_PRED_SETNE);
9869 }
9870 
tgsi_uif(struct r600_shader_ctx * ctx)9871 static int tgsi_uif(struct r600_shader_ctx *ctx)
9872 {
9873 	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
9874 }
9875 
tgsi_else(struct r600_shader_ctx * ctx)9876 static int tgsi_else(struct r600_shader_ctx *ctx)
9877 {
9878 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
9879 	ctx->bc->cf_last->pop_count = 1;
9880 
9881 	fc_set_mid(ctx, ctx->bc->fc_sp - 1);
9882 	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
9883 	return 0;
9884 }
9885 
tgsi_endif(struct r600_shader_ctx * ctx)9886 static int tgsi_endif(struct r600_shader_ctx *ctx)
9887 {
9888 	int offset = 2;
9889 	pops(ctx, 1);
9890 	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
9891 		R600_ERR("if/endif unbalanced in shader\n");
9892 		return -1;
9893 	}
9894 
9895 	/* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */
9896 	if (ctx->bc->cf_last->eg_alu_extended)
9897 			offset += 2;
9898 
9899 	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
9900 		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset;
9901 		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
9902 	} else {
9903 		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset;
9904 	}
9905 	fc_poplevel(ctx);
9906 
9907 	callstack_pop(ctx, FC_PUSH_VPM);
9908 	return 0;
9909 }
9910 
tgsi_bgnloop(struct r600_shader_ctx * ctx)9911 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
9912 {
9913 	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
9914 	 * limited to 4096 iterations, like the other LOOP_* instructions. */
9915 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
9916 
9917 	fc_pushlevel(ctx, FC_LOOP);
9918 
9919 	/* check stack depth */
9920 	callstack_push(ctx, FC_LOOP);
9921 	return 0;
9922 }
9923 
tgsi_endloop(struct r600_shader_ctx * ctx)9924 static int tgsi_endloop(struct r600_shader_ctx *ctx)
9925 {
9926 	int i;
9927 
9928 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
9929 
9930 	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
9931 		R600_ERR("loop/endloop in shader code are not paired.\n");
9932 		return -EINVAL;
9933 	}
9934 
9935 	/* fixup loop pointers - from r600isa
9936 	   LOOP END points to CF after LOOP START,
9937 	   LOOP START point to CF after LOOP END
9938 	   BRK/CONT point to LOOP END CF
9939 	*/
9940 	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;
9941 
9942 	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
9943 
9944 	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
9945 		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
9946 	}
9947 	/* XXX add LOOPRET support */
9948 	fc_poplevel(ctx);
9949 	callstack_pop(ctx, FC_LOOP);
9950 	return 0;
9951 }
9952 
tgsi_loop_brk_cont(struct r600_shader_ctx * ctx)9953 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
9954 {
9955 	unsigned int fscp;
9956 
9957 	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
9958 	{
9959 		if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
9960 			break;
9961 	}
9962 
9963 	if (fscp == 0) {
9964 		R600_ERR("Break not inside loop/endloop pair\n");
9965 		return -EINVAL;
9966 	}
9967 
9968 	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
9969 
9970 	fc_set_mid(ctx, fscp - 1);
9971 
9972 	return 0;
9973 }
9974 
tgsi_gs_emit(struct r600_shader_ctx * ctx)9975 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
9976 {
9977 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9978 	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
9979 	int r;
9980 
9981 	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
9982 		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
9983 
9984 	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
9985 	if (!r) {
9986 		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
9987 		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
9988 			return emit_inc_ring_offset(ctx, stream, TRUE);
9989 	}
9990 	return r;
9991 }
9992 
tgsi_umad(struct r600_shader_ctx * ctx)9993 static int tgsi_umad(struct r600_shader_ctx *ctx)
9994 {
9995 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9996 	struct r600_bytecode_alu alu;
9997 	int i, j, k, r;
9998 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9999 
10000 	/* src0 * src1 */
10001 	for (i = 0; i < lasti + 1; i++) {
10002 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10003 			continue;
10004 
10005 		if (ctx->bc->chip_class == CAYMAN) {
10006 			for (j = 0 ; j < 4; j++) {
10007 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10008 
10009 				alu.op = ALU_OP2_MULLO_UINT;
10010 				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
10011 					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
10012 				}
10013 				alu.dst.chan = j;
10014 				alu.dst.sel = ctx->temp_reg;
10015 				alu.dst.write = (j == i);
10016 				if (j == 3)
10017 					alu.last = 1;
10018 				r = r600_bytecode_add_alu(ctx->bc, &alu);
10019 				if (r)
10020 					return r;
10021 			}
10022 		} else {
10023 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10024 
10025 			alu.dst.chan = i;
10026 			alu.dst.sel = ctx->temp_reg;
10027 			alu.dst.write = 1;
10028 
10029 			alu.op = ALU_OP2_MULLO_UINT;
10030 			for (j = 0; j < 2; j++) {
10031 				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
10032 			}
10033 
10034 			alu.last = 1;
10035 			r = r600_bytecode_add_alu(ctx->bc, &alu);
10036 			if (r)
10037 				return r;
10038 		}
10039 	}
10040 
10041 
10042 	for (i = 0; i < lasti + 1; i++) {
10043 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10044 			continue;
10045 
10046 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10047 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10048 
10049 		alu.op = ALU_OP2_ADD_INT;
10050 
10051 		alu.src[0].sel = ctx->temp_reg;
10052 		alu.src[0].chan = i;
10053 
10054 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
10055 		if (i == lasti) {
10056 			alu.last = 1;
10057 		}
10058 		r = r600_bytecode_add_alu(ctx->bc, &alu);
10059 		if (r)
10060 			return r;
10061 	}
10062 	return 0;
10063 }
10064 
tgsi_pk2h(struct r600_shader_ctx * ctx)10065 static int tgsi_pk2h(struct r600_shader_ctx *ctx)
10066 {
10067 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10068 	struct r600_bytecode_alu alu;
10069 	int r, i;
10070 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10071 
10072 	/* temp.xy = f32_to_f16(src) */
10073 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10074 	alu.op = ALU_OP1_FLT32_TO_FLT16;
10075 	alu.dst.chan = 0;
10076 	alu.dst.sel = ctx->temp_reg;
10077 	alu.dst.write = 1;
10078 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10079 	r = r600_bytecode_add_alu(ctx->bc, &alu);
10080 	if (r)
10081 		return r;
10082 	alu.dst.chan = 1;
10083 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
10084 	alu.last = 1;
10085 	r = r600_bytecode_add_alu(ctx->bc, &alu);
10086 	if (r)
10087 		return r;
10088 
10089 	/* dst.x = temp.y * 0x10000 + temp.x */
10090 	for (i = 0; i < lasti + 1; i++) {
10091 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10092 			continue;
10093 
10094 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10095 		alu.op = ALU_OP3_MULADD_UINT24;
10096 		alu.is_op3 = 1;
10097 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10098 		alu.last = i == lasti;
10099 		alu.src[0].sel = ctx->temp_reg;
10100 		alu.src[0].chan = 1;
10101 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10102 		alu.src[1].value = 0x10000;
10103 		alu.src[2].sel = ctx->temp_reg;
10104 		alu.src[2].chan = 0;
10105 		r = r600_bytecode_add_alu(ctx->bc, &alu);
10106 		if (r)
10107 			return r;
10108 	}
10109 
10110 	return 0;
10111 }
10112 
tgsi_up2h(struct r600_shader_ctx * ctx)10113 static int tgsi_up2h(struct r600_shader_ctx *ctx)
10114 {
10115 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10116 	struct r600_bytecode_alu alu;
10117 	int r, i;
10118 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10119 
10120 	/* temp.x = src.x */
10121 	/* note: no need to mask out the high bits */
10122 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10123 	alu.op = ALU_OP1_MOV;
10124 	alu.dst.chan = 0;
10125 	alu.dst.sel = ctx->temp_reg;
10126 	alu.dst.write = 1;
10127 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10128 	r = r600_bytecode_add_alu(ctx->bc, &alu);
10129 	if (r)
10130 		return r;
10131 
10132 	/* temp.y = src.x >> 16 */
10133 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10134 	alu.op = ALU_OP2_LSHR_INT;
10135 	alu.dst.chan = 1;
10136 	alu.dst.sel = ctx->temp_reg;
10137 	alu.dst.write = 1;
10138 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10139 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10140 	alu.src[1].value = 16;
10141 	alu.last = 1;
10142 	r = r600_bytecode_add_alu(ctx->bc, &alu);
10143 	if (r)
10144 		return r;
10145 
10146 	/* dst.wz = dst.xy = f16_to_f32(temp.xy) */
10147 	for (i = 0; i < lasti + 1; i++) {
10148 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10149 			continue;
10150 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10151 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10152 		alu.op = ALU_OP1_FLT16_TO_FLT32;
10153 		alu.src[0].sel = ctx->temp_reg;
10154 		alu.src[0].chan = i % 2;
10155 		alu.last = i == lasti;
10156 		r = r600_bytecode_add_alu(ctx->bc, &alu);
10157 		if (r)
10158 			return r;
10159 	}
10160 
10161 	return 0;
10162 }
10163 
tgsi_bfe(struct r600_shader_ctx * ctx)10164 static int tgsi_bfe(struct r600_shader_ctx *ctx)
10165 {
10166 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10167 	struct r600_bytecode_alu alu;
10168 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10169 	int r, i;
10170 	int dst = -1;
10171 
10172 	if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
10173 	     inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
10174 	    (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
10175 	     inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
10176 		dst = r600_get_temp(ctx);
10177 
10178 	r = tgsi_op3_dst(ctx, dst);
10179 	if (r)
10180 		return r;
10181 
10182 	for (i = 0; i < lasti + 1; i++) {
10183 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10184 		alu.op = ALU_OP2_SETGE_INT;
10185 		r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
10186 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10187 		alu.src[1].value = 32;
10188 		alu.dst.sel = ctx->temp_reg;
10189 		alu.dst.chan = i;
10190 		alu.dst.write = 1;
10191 		if (i == lasti)
10192 			alu.last = 1;
10193 		r = r600_bytecode_add_alu(ctx->bc, &alu);
10194 		if (r)
10195 			return r;
10196 	}
10197 
10198 	for (i = 0; i < lasti + 1; i++) {
10199 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10200 		alu.op = ALU_OP3_CNDE_INT;
10201 		alu.is_op3 = 1;
10202 		alu.src[0].sel = ctx->temp_reg;
10203 		alu.src[0].chan = i;
10204 
10205 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10206 		if (dst != -1)
10207 			alu.src[1].sel = dst;
10208 		else
10209 			alu.src[1].sel = alu.dst.sel;
10210 		alu.src[1].chan = i;
10211 		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
10212 		alu.dst.write = 1;
10213 		if (i == lasti)
10214 			alu.last = 1;
10215 		r = r600_bytecode_add_alu(ctx->bc, &alu);
10216 		if (r)
10217 			return r;
10218 	}
10219 
10220 	return 0;
10221 }
10222 
tgsi_clock(struct r600_shader_ctx * ctx)10223 static int tgsi_clock(struct r600_shader_ctx *ctx)
10224 {
10225 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10226 	struct r600_bytecode_alu alu;
10227 	int r;
10228 
10229 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10230 	alu.op = ALU_OP1_MOV;
10231 	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
10232 	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
10233 	r = r600_bytecode_add_alu(ctx->bc, &alu);
10234 	if (r)
10235 		return r;
10236 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10237 	alu.op = ALU_OP1_MOV;
10238 	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
10239 	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
10240 	alu.last = 1;
10241 	r = r600_bytecode_add_alu(ctx->bc, &alu);
10242 	if (r)
10243 		return r;
10244 	return 0;
10245 }
10246 
/* Instruction dispatch table, indexed by TGSI opcode number.
 *
 * Each entry pairs a hardware opcode (ALU_OP*, FETCH_OP_* or CF_OP_*; the
 * handlers read it back through ctx->inst_info->op) with the function that
 * translates the TGSI instruction into bytecode.  ALU_OP0_NOP or 0 is used
 * where the handler picks its opcodes itself.
 *
 * Entries whose handler is tgsi_unsupported are not translated by this
 * table.  Bare numeric indices fill opcode numbers that have no
 * TGSI_OPCODE_* name here; all of them are unsupported except [103], which
 * is handled by tgsi_tex.
 *
 * Compared with eg_shader_tgsi_instruction that follows, this table leaves
 * FMA, CLOCK, PK2H and UP2H (among others) unsupported.
 *
 * NOTE(review): judging by the name and the tgsi_r600_arl handlers this is
 * presumably the table used for R600/R700-class chips - the code selecting
 * the table is not visible here, confirm. */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_unsupported},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[81]			= { ALU_OP0_NOP, tgsi_unsupported},
	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[TGSI_OPCODE_DFMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
10447 
10448 static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
10449 	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
10450 	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
10451 	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
10452 	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
10453 	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
10454 	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
10455 	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
10456 	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
10457 	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
10458 	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
10459 	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
10460 	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
10461 	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
10462 	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
10463 	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
10464 	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
10465 	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
10466 	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
10467 	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
10468 	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
10469 	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
10470 	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
10471 	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
10472 	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
10473 	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
10474 	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
10475 	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
10476 	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
10477 	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
10478 	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
10479 	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
10480 	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
10481 	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_clock},
10482 	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
10483 	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
10484 	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
10485 	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10486 	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10487 	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
10488 	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
10489 	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
10490 	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
10491 	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
10492 	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
10493 	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
10494 	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
10495 	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
10496 	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
10497 	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
10498 	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
10499 	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
10500 	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
10501 	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
10502 	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
10503 	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
10504 	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
10505 	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
10506 	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
10507 	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
10508 	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
10509 	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
10510 	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
10511 	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
10512 	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
10513 	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
10514 	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
10515 	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
10516 	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
10517 	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
10518 	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
10519 	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
10520 	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
10521 	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
10522 	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
10523 	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
10524 	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
10525 	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
10526 	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
10527 	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10528 	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10529 	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
10530 	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
10531 	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
10532 	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
10533 	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
10534 	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
10535 	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
10536 	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
10537 	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
10538 	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
10539 	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
10540 	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
10541 	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
10542 	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10543 	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
10544 	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
10545 	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
10546 	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
10547 	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
10548 	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
10549 	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
10550 	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10551 	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
10552 	[TGSI_OPCODE_RESQ]     	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
10553 	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
10554 	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
10555 	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
10556 	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
10557 	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
10558 	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
10559 	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
10560 	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
10561 	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
10562 	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
10563 	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
10564 	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
10565 	/* Refer below for TGSI_OPCODE_DFMA */
10566 	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
10567 	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
10568 	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
10569 	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
10570 	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
10571 	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
10572 	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
10573 	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
10574 	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
10575 	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
10576 	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
10577 	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
10578 	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
10579 	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
10580 	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
10581 	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
10582 	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
10583 	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
10584 	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
10585 	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
10586 	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
10587 	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
10588 	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
10589 	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
10590 	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
10591 	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
10592 	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
10593 	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
10594 	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
10595 	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
10596 	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
10597 	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
10598 	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
10599 	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
10600 	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
10601 	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
10602 	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
10603 	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
10604 	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
10605 	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
10606 	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
10607 	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
10608 	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_load},
10609 	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_store},
10610 	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
10611 	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
10612 	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
10613 	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
10614 	[TGSI_OPCODE_ATOMUADD]	= { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
10615 	[TGSI_OPCODE_ATOMXCHG]	= { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
10616 	[TGSI_OPCODE_ATOMCAS]	= { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
10617 	[TGSI_OPCODE_ATOMAND]	= { V_RAT_INST_AND_RTN, tgsi_atomic_op},
10618 	[TGSI_OPCODE_ATOMOR]	= { V_RAT_INST_OR_RTN, tgsi_atomic_op},
10619 	[TGSI_OPCODE_ATOMXOR]	= { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
10620 	[TGSI_OPCODE_ATOMUMIN]	= { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
10621 	[TGSI_OPCODE_ATOMUMAX]	= { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
10622 	[TGSI_OPCODE_ATOMIMIN]	= { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
10623 	[TGSI_OPCODE_ATOMIMAX]	= { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
10624 	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
10625 	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
10626 	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
10627 	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
10628 	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
10629 	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
10630 	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
10631 	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_bfe},
10632 	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_bfe},
10633 	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
10634 	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
10635 	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
10636 	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
10637 	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
10638 	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
10639 	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
10640 	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
10641 	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
10642 	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
10643 	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
10644 	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
10645 	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
10646 	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
10647 	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
10648 	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
10649 	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
10650 	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
10651 	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
10652 	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
10653 	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
10654 	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
10655 	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
10656 	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
10657 	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
10658 	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
10659 	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
10660 	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
10661 	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
10662 	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
10663 	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
10664 	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
10665 	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
10666 	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
10667 	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
10668 };
10669 
/*
 * TGSI opcode dispatch table; the "cm" prefix presumably stands for Cayman
 * (see the "CAYMAN notes" comment near the top of this file) -- confirm at
 * the site that selects this table.
 *
 * The array is indexed by TGSI opcode through designated initializers.  Each
 * entry pairs a hardware opcode constant (ALU_OP*, FETCH_OP_*, CF_OP_*,
 * V_RAT_INST_* or a literal 0) with the name of a handler function.
 * NOTE(review): the handler is presumably what translates the TGSI
 * instruction and the constant its main hardware opcode; check the
 * definition of struct r600_shader_tgsi_instruction.
 *
 * Slots routed to tgsi_unsupported are, going by the handler name, opcodes
 * this backend does not translate.  Bare numeric designators are slots that
 * have no TGSI_OPCODE_* name in this table.
 *
 * Entry order matters if a numeric designator ever aliases a named opcode:
 * with designated initializers the last initializer for a slot wins, so do
 * not reorder entries without checking the enum values.
 *
 * Entries using cayman_* handlers (RCP, RSQ, SQRT, EX2, LG2, POW, COS, SIN,
 * UMUL, IMUL_HI, UMUL_HI, DMUL, DDIV, DRCP, DSQRT, DRSQ) are presumably the
 * ones affected by the t-slot differences described in the CAYMAN notes.
 */
static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	/* DP2, DP3 and DP4 all use the same DOT4 opcode and the tgsi_dp handler. */
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	/* "Less than" style compares reuse the SETGT/SETGE opcodes through
	 * tgsi_op2_swap (presumably with the operands exchanged -- confirm in
	 * the handler). */
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_clock},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, cayman_trig},
	/* Derivatives are routed through the texture handler with the
	 * GET_GRADIENTS fetch opcodes. */
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, cayman_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	/* Subroutine opcodes (CAL/RET here, BGNSUB/ENDSUB below) map to
	 * tgsi_unsupported. */
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	/* Control flow: BRK/CONT carry a CF_OP_* constant, the remaining
	 * structured-flow opcodes carry ALU_OP0_NOP and a dedicated handler. */
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	/* The _FINE derivative variants use the same entries as DDX/DDY. */
	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	/* NOTE(review): slot 103 has no symbolic name here yet is still routed
	 * to tgsi_tex with the same fetch opcode as TXQ -- presumably a retired
	 * opcode kept working; confirm against the TGSI opcode list. */
	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]     	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	/* NOTE(review): FSNE goes through tgsi_op2_swap while SNE/USNE use
	 * tgsi_op2; presumably harmless because inequality is symmetric --
	 * confirm. */
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	/* MEMBAR shares its entry with TGSI_OPCODE_BARRIER further down. */
	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	/* Refer below for TGSI_OPCODE_DFMA */
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	/* Integer multiplies (UMUL here, IMUL_HI/UMUL_HI below) use the
	 * dedicated cayman_mul_int_instr handler. */
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
	/* SWITCH/CASE/DEFAULT/ENDSWITCH map to tgsi_unsupported. */
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	/* The SAMPLE_* family, GATHER4 and SVIEWINFO map to tgsi_unsupported;
	 * these entries use a literal 0 instead of ALU_OP0_NOP as the opcode. */
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_load},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_store},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	/* Atomics: the first member is a V_RAT_INST_*_RTN constant rather than
	 * an ALU opcode; all share the tgsi_atomic_op handler. */
	[TGSI_OPCODE_ATOMUADD]	= { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXCHG]	= { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMCAS]	= { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMAND]	= { V_RAT_INST_AND_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMOR]	= { V_RAT_INST_OR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXOR]	= { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMIN]	= { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMAX]	= { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMIN]	= { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMAX]	= { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
	/* TEX2/TXB2/TXL2 use the same fetch opcodes as TEX/TXB/TXL. */
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_bfe},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_bfe},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	/* Double-precision opcodes: *_64 ALU opcodes with the tgsi_*_64,
	 * cayman_*_double_* and egcm_* handlers. */
	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	/* DMAD and DFMA share the same FMA_64 entry. */
	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	/* Explicit TGSI_OPCODE_LAST entry; presumably sizes the array so that
	 * every opcode value below LAST has a slot -- confirm. */
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
10891