r600_shader.c revision da676eab93e7dad30b574b4eb4cffd4df952e819
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600d.h"
28
29#include "pipe/p_shader_tokens.h"
30#include "tgsi/tgsi_info.h"
31#include "tgsi/tgsi_parse.h"
32#include "tgsi/tgsi_scan.h"
33#include "tgsi/tgsi_dump.h"
34#include "util/u_memory.h"
35#include <stdio.h>
36#include <errno.h>
37#include <byteswap.h>
38
39/* CAYMAN notes
40Why CAYMAN got loops for lots of instructions is explained here.
41
42-These 8xx t-slot only ops are implemented in all vector slots.
43MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
44These 8xx t-slot only opcodes become vector ops, with all four
45slots expecting the arguments on sources a and b. Result is
46broadcast to all channels.
47MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
48These 8xx t-slot only opcodes become vector ops in the z, y, and
49x slots.
50EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
51RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
52SQRT_IEEE/_64
53SIN/COS
54The w slot may have an independent co-issued operation, or if the
55result is required to be in the w slot, the opcode above may be
56issued in the w slot as well.
57The compiler must issue the source argument to slots z, y, and x
58*/
59
60static int r600_pipe_shader(struct pipe_context *ctx, struct r600_pipe_shader *shader)
61{
62	struct r600_context *rctx = (struct r600_context *)ctx;
63	struct r600_shader *rshader = &shader->shader;
64	uint32_t *ptr;
65	int	i;
66
67	/* copy new shader */
68	if (shader->bo == NULL) {
69		shader->bo = (struct r600_resource*)
70			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, rshader->bc.ndw * 4);
71		if (shader->bo == NULL) {
72			return -ENOMEM;
73		}
74		ptr = (uint32_t*)rctx->ws->buffer_map(shader->bo->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
75		if (R600_BIG_ENDIAN) {
76			for (i = 0; i < rshader->bc.ndw; ++i) {
77				ptr[i] = bswap_32(rshader->bc.bytecode[i]);
78			}
79		} else {
80			memcpy(ptr, rshader->bc.bytecode, rshader->bc.ndw * sizeof(*ptr));
81		}
82		rctx->ws->buffer_unmap(shader->bo->cs_buf);
83	}
84	/* build state */
85	switch (rshader->processor_type) {
86	case TGSI_PROCESSOR_VERTEX:
87		if (rctx->chip_class >= EVERGREEN) {
88			evergreen_pipe_shader_vs(ctx, shader);
89		} else {
90			r600_pipe_shader_vs(ctx, shader);
91		}
92		break;
93	case TGSI_PROCESSOR_FRAGMENT:
94		if (rctx->chip_class >= EVERGREEN) {
95			evergreen_pipe_shader_ps(ctx, shader);
96		} else {
97			r600_pipe_shader_ps(ctx, shader);
98		}
99		break;
100	default:
101		return -EINVAL;
102	}
103	return 0;
104}
105
/* TGSI -> r600 translation entry point, defined later in this file. */
static int r600_shader_from_tgsi(struct r600_context *rctx,
				 struct r600_pipe_shader *pipeshader);
107
108int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *shader)
109{
110	static int dump_shaders = -1;
111	struct r600_context *rctx = (struct r600_context *)ctx;
112	struct r600_pipe_shader_selector *sel = shader->selector;
113	int r;
114
115	/* Would like some magic "get_bool_option_once" routine.
116	*/
117	if (dump_shaders == -1)
118		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
119
120	if (dump_shaders) {
121		fprintf(stderr, "--------------------------------------------------------------\n");
122		tgsi_dump(sel->tokens, 0);
123
124		if (sel->so.num_outputs) {
125			unsigned i;
126			fprintf(stderr, "STREAMOUT\n");
127			for (i = 0; i < sel->so.num_outputs; i++) {
128				unsigned mask = ((1 << sel->so.output[i].num_components) - 1) <<
129						sel->so.output[i].start_component;
130				fprintf(stderr, "  %i: MEM_STREAM0_BUF%i OUT[%i].%s%s%s%s\n", i,
131					sel->so.output[i].output_buffer, sel->so.output[i].register_index,
132				        mask & 1 ? "x" : "_",
133				        (mask >> 1) & 1 ? "y" : "_",
134				        (mask >> 2) & 1 ? "z" : "_",
135				        (mask >> 3) & 1 ? "w" : "_");
136			}
137		}
138	}
139	r = r600_shader_from_tgsi(rctx, shader);
140	if (r) {
141		R600_ERR("translation from TGSI failed !\n");
142		return r;
143	}
144	r = r600_bytecode_build(&shader->shader.bc);
145	if (r) {
146		R600_ERR("building bytecode failed !\n");
147		return r;
148	}
149	if (dump_shaders) {
150		r600_bytecode_dump(&shader->shader.bc);
151		fprintf(stderr, "______________________________________________________________\n");
152	}
153	return r600_pipe_shader(ctx, shader);
154}
155
156void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
157{
158	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
159	r600_bytecode_clear(&shader->shader.bc);
160}
161
162/*
163 * tgsi -> r600 shader
164 */
/* Defined below; the translation context only stores a pointer to it. */
struct r600_shader_tgsi_instruction;

/* A TGSI source operand as filled in by tgsi_src(). */
struct r600_shader_src {
	unsigned sel;		/* register index plus file offset, or a special selector */
	unsigned swizzle[4];	/* source channel used for x, y, z, w */
	unsigned neg;		/* copied from the TGSI Negate flag */
	unsigned abs;		/* copied from the TGSI Absolute flag */
	unsigned rel;		/* V_SQ_REL_RELATIVE for indirect registers */
	uint32_t value[4];	/* literal dwords, used with V_SQ_ALU_SRC_LITERAL */
};
175
176struct r600_shader_ctx {
177	struct tgsi_shader_info			info;
178	struct tgsi_parse_context		parse;
179	const struct tgsi_token			*tokens;
180	unsigned				type;
181	unsigned				file_offset[TGSI_FILE_COUNT];
182	unsigned				temp_reg;
183	struct r600_shader_tgsi_instruction	*inst_info;
184	struct r600_bytecode			*bc;
185	struct r600_shader			*shader;
186	struct r600_shader_src			src[4];
187	uint32_t				*literals;
188	uint32_t				nliterals;
189	uint32_t				max_driver_temp_used;
190	/* needed for evergreen interpolation */
191	boolean                                 input_centroid;
192	boolean                                 input_linear;
193	boolean                                 input_perspective;
194	int					num_interp_gpr;
195	int					face_gpr;
196	int					colors_used;
197	boolean                 clip_vertex_write;
198	unsigned                cv_output;
199	int					fragcoord_input;
200	int					native_integers;
201};
202
/*
 * One entry of the per-chip opcode tables. Code in this file looks entries
 * up by TGSI opcode (e.g. table[TGSI_OPCODE_BRK]).
 */
struct r600_shader_tgsi_instruction {
	unsigned tgsi_opcode;
	unsigned is_op3;
	unsigned r600_opcode;
	/* handler invoked with the translation context */
	int (*process)(struct r600_shader_ctx *ctx);
};
209
210static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
211static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
212static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
213static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
214static int tgsi_else(struct r600_shader_ctx *ctx);
215static int tgsi_endif(struct r600_shader_ctx *ctx);
216static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
217static int tgsi_endloop(struct r600_shader_ctx *ctx);
218static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
219
220/*
221 * bytestream -> r600 shader
222 *
223 * These functions are used to transform the output of the LLVM backend into
224 * struct r600_bytecode.
225 */
226
/* Decoder entry point, defined after the per-instruction decoders. */
static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
					   unsigned char *bytes,
					   unsigned num_bytes);
229
#ifdef HAVE_OPENCL
/*
 * Compile an LLVM module for the current chip family and turn the resulting
 * byte stream into r600 bytecode stored in *bytecode.
 *
 * The bytecode is dumped when the R600_DUMP_SHADERS debug option is set.
 * Always returns 1; a failure to build the bytecode is only reported on the
 * error log so that existing callers keep seeing the same return value.
 */
int r600_compute_shader_create(struct pipe_context * ctx,
	LLVMModuleRef mod,  struct r600_bytecode * bytecode)
{
	struct r600_context *r600_ctx = (struct r600_context *)ctx;
	unsigned char * bytes;
	unsigned byte_count;
	struct r600_shader_ctx shader_ctx;
	unsigned dump = 0;

	if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
		dump = 1;
	}

	/* Only the bc member is filled in below; zero everything else so that
	 * the decoders never read indeterminate context fields. */
	memset(&shader_ctx, 0, sizeof(shader_ctx));

	r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
	shader_ctx.bc = bytecode;
	r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family);
	shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
	r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
	if (shader_ctx.bc->chip_class == CAYMAN) {
		cm_bytecode_add_cf_end(shader_ctx.bc);
	}
	if (r600_bytecode_build(shader_ctx.bc)) {
		R600_ERR("building bytecode failed !\n");
	}
	if (dump) {
		r600_bytecode_dump(shader_ctx.bc);
	}
	return 1;
}

#endif /* HAVE_OPENCL */
260
/*
 * Read a little-endian 32-bit value from bytes[] starting at *bytes_read
 * and advance *bytes_read by four.
 */
static uint32_t i32_from_byte_stream(unsigned char * bytes,
		unsigned * bytes_read)
{
	unsigned i;
	uint32_t out = 0;
	for (i = 0; i < 4; i++) {
		/* Widen before shifting: an unsigned char promotes to int, and
		 * shifting a byte >= 0x80 left by 24 would overflow it. */
		out |= (uint32_t)bytes[(*bytes_read)++] << (8 * i);
	}
	return out;
}
271
272static unsigned r600_src_from_byte_stream(unsigned char * bytes,
273		unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
274{
275	unsigned i;
276	unsigned sel0, sel1;
277	sel0 = bytes[bytes_read++];
278	sel1 = bytes[bytes_read++];
279	alu->src[src_idx].sel = sel0 | (sel1 << 8);
280	alu->src[src_idx].chan = bytes[bytes_read++];
281	alu->src[src_idx].neg = bytes[bytes_read++];
282	alu->src[src_idx].abs = bytes[bytes_read++];
283	alu->src[src_idx].rel = bytes[bytes_read++];
284	alu->src[src_idx].kc_bank = bytes[bytes_read++];
285	for (i = 0; i < 4; i++) {
286		alu->src[src_idx].value |= bytes[bytes_read++] << (i * 8);
287	}
288	return bytes_read;
289}
290
291static unsigned r600_alu_from_byte_stream(struct r600_shader_ctx *ctx,
292				unsigned char * bytes, unsigned bytes_read)
293{
294	unsigned src_idx;
295	unsigned inst0, inst1;
296	struct r600_bytecode_alu alu;
297	memset(&alu, 0, sizeof(alu));
298	for(src_idx = 0; src_idx < 3; src_idx++) {
299		bytes_read = r600_src_from_byte_stream(bytes, bytes_read,
300								&alu, src_idx);
301	}
302
303	alu.dst.sel = bytes[bytes_read++];
304	alu.dst.chan = bytes[bytes_read++];
305	alu.dst.clamp = bytes[bytes_read++];
306	alu.dst.write = bytes[bytes_read++];
307	alu.dst.rel = bytes[bytes_read++];
308	inst0 = bytes[bytes_read++];
309	inst1 = bytes[bytes_read++];
310	alu.inst = inst0 | (inst1 << 8);
311	alu.last = bytes[bytes_read++];
312	alu.is_op3 = bytes[bytes_read++];
313	alu.pred_sel = bytes[bytes_read++];
314	alu.bank_swizzle = bytes[bytes_read++];
315	alu.bank_swizzle_force = bytes[bytes_read++];
316	alu.omod = bytes[bytes_read++];
317	alu.index_mode = bytes[bytes_read++];
318	r600_bytecode_add_alu(ctx->bc, &alu);
319
320	/* XXX: Handle other KILL instructions */
321	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT)) {
322		ctx->shader->uses_kill = 1;
323		/* XXX: This should be enforced in the LLVM backend. */
324		ctx->bc->force_add_cf = 1;
325	}
326	return bytes_read;
327}
328
329static void llvm_if(struct r600_shader_ctx *ctx, struct r600_bytecode_alu * alu,
330	unsigned pred_inst)
331{
332	alu->inst = pred_inst;
333	alu->execute_mask = 1;
334	alu->update_pred = 1;
335	alu->dst.write = 0;
336	alu->src[1].sel = V_SQ_ALU_SRC_0;
337	alu->src[1].chan = 0;
338	alu->last = 1;
339	r600_bytecode_add_alu_type(ctx->bc, alu,
340		CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
341
342	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
343	fc_pushlevel(ctx, FC_IF);
344	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
345}
346
347static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx,
348			struct r600_bytecode_alu *alu, unsigned compare_opcode)
349{
350	unsigned opcode = TGSI_OPCODE_BRK;
351	if (ctx->bc->chip_class == CAYMAN)
352		ctx->inst_info = &cm_shader_tgsi_instruction[opcode];
353	else if (ctx->bc->chip_class >= EVERGREEN)
354		ctx->inst_info = &eg_shader_tgsi_instruction[opcode];
355	else
356		ctx->inst_info = &r600_shader_tgsi_instruction[opcode];
357	llvm_if(ctx, alu, compare_opcode);
358	tgsi_loop_brk_cont(ctx);
359	tgsi_endif(ctx);
360}
361
362static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
363				unsigned char * bytes, unsigned bytes_read)
364{
365	struct r600_bytecode_alu alu;
366	unsigned inst;
367	memset(&alu, 0, sizeof(alu));
368	bytes_read = r600_src_from_byte_stream(bytes, bytes_read, &alu, 0);
369	inst = bytes[bytes_read++];
370	switch (inst) {
371	case 0: /* FC_IF */
372		llvm_if(ctx, &alu,
373			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
374		break;
375	case 1: /* FC_IF_INT */
376		llvm_if(ctx, &alu,
377			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
378		break;
379	case 2: /* FC_ELSE */
380		tgsi_else(ctx);
381		break;
382	case 3: /* FC_ENDIF */
383		tgsi_endif(ctx);
384		break;
385	case 4: /* FC_BGNLOOP */
386		tgsi_bgnloop(ctx);
387		break;
388	case 5: /* FC_ENDLOOP */
389		tgsi_endloop(ctx);
390		break;
391	case 6: /* FC_BREAK */
392		r600_break_from_byte_stream(ctx, &alu,
393			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
394		break;
395	case 7: /* FC_BREAK_NZ_INT */
396		r600_break_from_byte_stream(ctx, &alu,
397			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
398		break;
399	case 8: /* FC_CONTINUE */
400		{
401			unsigned opcode = TGSI_OPCODE_CONT;
402			if (ctx->bc->chip_class == CAYMAN) {
403				ctx->inst_info =
404					&cm_shader_tgsi_instruction[opcode];
405			} else if (ctx->bc->chip_class >= EVERGREEN) {
406				ctx->inst_info =
407					&eg_shader_tgsi_instruction[opcode];
408			} else {
409				ctx->inst_info =
410					&r600_shader_tgsi_instruction[opcode];
411			}
412			tgsi_loop_brk_cont(ctx);
413		}
414		break;
415	case 9: /* FC_BREAK_Z_INT */
416		r600_break_from_byte_stream(ctx, &alu,
417			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
418		break;
419	case 10: /* FC_BREAK_NZ */
420		r600_break_from_byte_stream(ctx, &alu,
421			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
422		break;
423	}
424
425	return bytes_read;
426}
427
428static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
429				unsigned char * bytes, unsigned bytes_read)
430{
431	struct r600_bytecode_tex tex;
432
433	tex.inst = bytes[bytes_read++];
434	tex.resource_id = bytes[bytes_read++];
435	tex.src_gpr = bytes[bytes_read++];
436	tex.src_rel = bytes[bytes_read++];
437	tex.dst_gpr = bytes[bytes_read++];
438	tex.dst_rel = bytes[bytes_read++];
439	tex.dst_sel_x = bytes[bytes_read++];
440	tex.dst_sel_y = bytes[bytes_read++];
441	tex.dst_sel_z = bytes[bytes_read++];
442	tex.dst_sel_w = bytes[bytes_read++];
443	tex.lod_bias = bytes[bytes_read++];
444	tex.coord_type_x = bytes[bytes_read++];
445	tex.coord_type_y = bytes[bytes_read++];
446	tex.coord_type_z = bytes[bytes_read++];
447	tex.coord_type_w = bytes[bytes_read++];
448	tex.offset_x = bytes[bytes_read++];
449	tex.offset_y = bytes[bytes_read++];
450	tex.offset_z = bytes[bytes_read++];
451	tex.sampler_id = bytes[bytes_read++];
452	tex.src_sel_x = bytes[bytes_read++];
453	tex.src_sel_y = bytes[bytes_read++];
454	tex.src_sel_z = bytes[bytes_read++];
455	tex.src_sel_w = bytes[bytes_read++];
456
457	r600_bytecode_add_tex(ctx->bc, &tex);
458
459	return bytes_read;
460}
461
462static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
463	unsigned char * bytes, unsigned bytes_read)
464{
465	struct r600_bytecode_vtx vtx;
466
467	uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
468        uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
469	uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);
470
471	memset(&vtx, 0, sizeof(vtx));
472
473	/* WORD0 */
474	vtx.inst = G_SQ_VTX_WORD0_VTX_INST(word0);
475	vtx.fetch_type = G_SQ_VTX_WORD0_FETCH_TYPE(word0);
476	vtx.buffer_id = G_SQ_VTX_WORD0_BUFFER_ID(word0);
477	vtx.src_gpr = G_SQ_VTX_WORD0_SRC_GPR(word0);
478	vtx.src_sel_x = G_SQ_VTX_WORD0_SRC_SEL_X(word0);
479	vtx.mega_fetch_count = G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(word0);
480
481	/* WORD1 */
482	vtx.dst_gpr = G_SQ_VTX_WORD1_GPR_DST_GPR(word1);
483	vtx.dst_sel_x = G_SQ_VTX_WORD1_DST_SEL_X(word1);
484	vtx.dst_sel_y = G_SQ_VTX_WORD1_DST_SEL_Y(word1);
485	vtx.dst_sel_z = G_SQ_VTX_WORD1_DST_SEL_Z(word1);
486	vtx.dst_sel_w = G_SQ_VTX_WORD1_DST_SEL_W(word1);
487	vtx.use_const_fields = G_SQ_VTX_WORD1_USE_CONST_FIELDS(word1);
488	vtx.data_format = G_SQ_VTX_WORD1_DATA_FORMAT(word1);
489	vtx.num_format_all = G_SQ_VTX_WORD1_NUM_FORMAT_ALL(word1);
490	vtx.format_comp_all = G_SQ_VTX_WORD1_FORMAT_COMP_ALL(word1);
491	vtx.srf_mode_all = G_SQ_VTX_WORD1_SRF_MODE_ALL(word1);
492
493	/* WORD 2*/
494	vtx.offset = G_SQ_VTX_WORD2_OFFSET(word2);
495	vtx.endian = G_SQ_VTX_WORD2_ENDIAN_SWAP(word2);
496
497	if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
498		fprintf(stderr, "Error adding vtx\n");
499	}
500	/* Use the Texture Cache */
501	ctx->bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX;
502	return bytes_read;
503}
504
505static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
506				unsigned char * bytes,	unsigned num_bytes)
507{
508	unsigned bytes_read = 0;
509	unsigned i, byte;
510	while (bytes_read < num_bytes) {
511		char inst_type = bytes[bytes_read++];
512		switch (inst_type) {
513		case 0:
514			bytes_read = r600_alu_from_byte_stream(ctx, bytes,
515								bytes_read);
516			break;
517		case 1:
518			bytes_read = r600_tex_from_byte_stream(ctx, bytes,
519								bytes_read);
520			break;
521		case 2:
522			bytes_read = r600_fc_from_byte_stream(ctx, bytes,
523								bytes_read);
524			break;
525		case 3:
526			r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
527			for (i = 0; i < 2; i++) {
528				for (byte = 0 ; byte < 4; byte++) {
529					ctx->bc->cf_last->isa[i] |=
530					(bytes[bytes_read++] << (byte * 8));
531				}
532			}
533			break;
534
535		case 4:
536			bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
537								bytes_read);
538			break;
539		default:
540			/* XXX: Error here */
541			break;
542		}
543	}
544}
545
546/* End bytestream -> r600 shader functions*/
547
548static int tgsi_is_supported(struct r600_shader_ctx *ctx)
549{
550	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
551	int j;
552
553	if (i->Instruction.NumDstRegs > 1) {
554		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
555		return -EINVAL;
556	}
557	if (i->Instruction.Predicate) {
558		R600_ERR("predicate unsupported\n");
559		return -EINVAL;
560	}
561#if 0
562	if (i->Instruction.Label) {
563		R600_ERR("label unsupported\n");
564		return -EINVAL;
565	}
566#endif
567	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
568		if (i->Src[j].Register.Dimension) {
569			R600_ERR("unsupported src %d (dimension %d)\n", j,
570				 i->Src[j].Register.Dimension);
571			return -EINVAL;
572		}
573	}
574	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
575		if (i->Dst[j].Register.Dimension) {
576			R600_ERR("unsupported dst (dimension)\n");
577			return -EINVAL;
578		}
579	}
580	return 0;
581}
582
583static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
584{
585	int i, r;
586	struct r600_bytecode_alu alu;
587	int gpr = 0, base_chan = 0;
588	int ij_index = 0;
589
590	if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
591		ij_index = 0;
592		if (ctx->shader->input[input].centroid)
593			ij_index++;
594	} else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
595		ij_index = 0;
596		/* if we have perspective add one */
597		if (ctx->input_perspective)  {
598			ij_index++;
599			/* if we have perspective centroid */
600			if (ctx->input_centroid)
601				ij_index++;
602		}
603		if (ctx->shader->input[input].centroid)
604			ij_index++;
605	}
606
607	/* work out gpr and base_chan from index */
608	gpr = ij_index / 2;
609	base_chan = (2 * (ij_index % 2)) + 1;
610
611	for (i = 0; i < 8; i++) {
612		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
613
614		if (i < 4)
615			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_ZW;
616		else
617			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_XY;
618
619		if ((i > 1) && (i < 6)) {
620			alu.dst.sel = ctx->shader->input[input].gpr;
621			alu.dst.write = 1;
622		}
623
624		alu.dst.chan = i % 4;
625
626		alu.src[0].sel = gpr;
627		alu.src[0].chan = (base_chan - (i % 2));
628
629		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
630
631		alu.bank_swizzle_force = SQ_ALU_VEC_210;
632		if ((i % 4) == 3)
633			alu.last = 1;
634		r = r600_bytecode_add_alu(ctx->bc, &alu);
635		if (r)
636			return r;
637	}
638	return 0;
639}
640
641static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
642{
643	int i, r;
644	struct r600_bytecode_alu alu;
645
646	for (i = 0; i < 4; i++) {
647		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
648
649		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P0;
650
651		alu.dst.sel = ctx->shader->input[input].gpr;
652		alu.dst.write = 1;
653
654		alu.dst.chan = i;
655
656		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
657		alu.src[0].chan = i;
658
659		if (i == 3)
660			alu.last = 1;
661		r = r600_bytecode_add_alu(ctx->bc, &alu);
662		if (r)
663			return r;
664	}
665	return 0;
666}
667
668/*
669 * Special export handling in shaders
670 *
671 * shader export ARRAY_BASE for EXPORT_POS:
672 * 60 is position
673 * 61 is misc vector
674 * 62, 63 are clip distance vectors
675 *
676 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
677 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
678 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
679 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
680 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
681 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
682 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
683 * exclusive from render target index)
684 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
685 *
686 *
687 * shader export ARRAY_BASE for EXPORT_PIXEL:
688 * 0-7 CB targets
689 * 61 computed Z vector
690 *
691 * The use of the values exported in the computed Z vector are controlled
692 * by DB_SHADER_CONTROL:
693 * Z_EXPORT_ENABLE - Z as a float in RED
694 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
695 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
696 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
697 * DB_SOURCE_FORMAT - export control restrictions
698 *
699 */
700
701
702/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
703static int r600_spi_sid(struct r600_shader_io * io)
704{
705	int index, name = io->name;
706
707	/* These params are handled differently, they don't need
708	 * semantic indices, so we'll use 0 for them.
709	 */
710	if (name == TGSI_SEMANTIC_POSITION ||
711		name == TGSI_SEMANTIC_PSIZE ||
712		name == TGSI_SEMANTIC_FACE)
713		index = 0;
714	else {
715		if (name == TGSI_SEMANTIC_GENERIC) {
716			/* For generic params simply use sid from tgsi */
717			index = io->sid;
718		} else {
719			/* For non-generic params - pack name and sid into 8 bits */
720			index = 0x80 | (name<<3) | (io->sid);
721		}
722
723		/* Make sure that all really used indices have nonzero value, so
724		 * we can just compare it to 0 later instead of comparing the name
725		 * with different values to detect special cases. */
726		index++;
727	}
728
729	return index;
730};
731
732/* turn input into interpolate on EG */
733static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
734{
735	int r = 0;
736
737	if (ctx->shader->input[index].spi_sid) {
738		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
739		if (ctx->shader->input[index].interpolate > 0) {
740			r = evergreen_interp_alu(ctx, index);
741		} else {
742			r = evergreen_interp_flat(ctx, index);
743		}
744	}
745	return r;
746}
747
748static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
749{
750	struct r600_bytecode_alu alu;
751	int i, r;
752	int gpr_front = ctx->shader->input[front].gpr;
753	int gpr_back = ctx->shader->input[back].gpr;
754
755	for (i = 0; i < 4; i++) {
756		memset(&alu, 0, sizeof(alu));
757		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
758		alu.is_op3 = 1;
759		alu.dst.write = 1;
760		alu.dst.sel = gpr_front;
761		alu.src[0].sel = ctx->face_gpr;
762		alu.src[1].sel = gpr_front;
763		alu.src[2].sel = gpr_back;
764
765		alu.dst.chan = i;
766		alu.src[1].chan = i;
767		alu.src[2].chan = i;
768		alu.last = (i==3);
769
770		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
771			return r;
772	}
773
774	return 0;
775}
776
777static int tgsi_declaration(struct r600_shader_ctx *ctx)
778{
779	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
780	unsigned i;
781	int r;
782
783	switch (d->Declaration.File) {
784	case TGSI_FILE_INPUT:
785		i = ctx->shader->ninput++;
786		ctx->shader->input[i].name = d->Semantic.Name;
787		ctx->shader->input[i].sid = d->Semantic.Index;
788		ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
789		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
790		ctx->shader->input[i].centroid = d->Interp.Centroid;
791		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
792		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
793			switch (ctx->shader->input[i].name) {
794			case TGSI_SEMANTIC_FACE:
795				ctx->face_gpr = ctx->shader->input[i].gpr;
796				break;
797			case TGSI_SEMANTIC_COLOR:
798				ctx->colors_used++;
799				break;
800			case TGSI_SEMANTIC_POSITION:
801				ctx->fragcoord_input = i;
802				break;
803			}
804			if (ctx->bc->chip_class >= EVERGREEN) {
805				if ((r = evergreen_interp_input(ctx, i)))
806					return r;
807			}
808		}
809		break;
810	case TGSI_FILE_OUTPUT:
811		i = ctx->shader->noutput++;
812		ctx->shader->output[i].name = d->Semantic.Name;
813		ctx->shader->output[i].sid = d->Semantic.Index;
814		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
815		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
816		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
817		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
818		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
819			switch (d->Semantic.Name) {
820			case TGSI_SEMANTIC_CLIPDIST:
821				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
822				break;
823			case TGSI_SEMANTIC_PSIZE:
824				ctx->shader->vs_out_misc_write = 1;
825				ctx->shader->vs_out_point_size = 1;
826				break;
827			case TGSI_SEMANTIC_CLIPVERTEX:
828				ctx->clip_vertex_write = TRUE;
829				ctx->cv_output = i;
830				break;
831			}
832		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
833			switch (d->Semantic.Name) {
834			case TGSI_SEMANTIC_COLOR:
835				ctx->shader->nr_ps_max_color_exports++;
836				break;
837			}
838		}
839		break;
840	case TGSI_FILE_CONSTANT:
841	case TGSI_FILE_TEMPORARY:
842	case TGSI_FILE_SAMPLER:
843	case TGSI_FILE_ADDRESS:
844		break;
845
846	case TGSI_FILE_SYSTEM_VALUE:
847		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
848			if (!ctx->native_integers) {
849				struct r600_bytecode_alu alu;
850				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
851
852				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
853				alu.src[0].sel = 0;
854				alu.src[0].chan = 3;
855
856				alu.dst.sel = 0;
857				alu.dst.chan = 3;
858				alu.dst.write = 1;
859				alu.last = 1;
860
861				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
862					return r;
863			}
864			break;
865		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
866			break;
867	default:
868		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
869		return -EINVAL;
870	}
871	return 0;
872}
873
874static int r600_get_temp(struct r600_shader_ctx *ctx)
875{
876	return ctx->temp_reg + ctx->max_driver_temp_used++;
877}
878
/*
 * For evergreen we need to scan the shader to find the number of GPRs we
 * need to reserve for interpolation.
 *
 * To work that out we need to know:
 *  - whether any centroid inputs are going to be emitted
 *  - whether perspective and/or linear interpolation is required
 */
887static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
888{
889	int i;
890	int num_baryc;
891
892	ctx->input_linear = FALSE;
893	ctx->input_perspective = FALSE;
894	ctx->input_centroid = FALSE;
895	ctx->num_interp_gpr = 1;
896
897	/* any centroid inputs */
898	for (i = 0; i < ctx->info.num_inputs; i++) {
899		/* skip position/face */
900		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
901		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
902			continue;
903		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
904			ctx->input_linear = TRUE;
905		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
906			ctx->input_perspective = TRUE;
907		if (ctx->info.input_centroid[i])
908			ctx->input_centroid = TRUE;
909	}
910
911	num_baryc = 0;
912	/* ignoring sample for now */
913	if (ctx->input_perspective)
914		num_baryc++;
915	if (ctx->input_linear)
916		num_baryc++;
917	if (ctx->input_centroid)
918		num_baryc *= 2;
919
920	ctx->num_interp_gpr += (num_baryc + 1) >> 1;
921
922	/* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */
923	return ctx->num_interp_gpr;
924}
925
926static void tgsi_src(struct r600_shader_ctx *ctx,
927		     const struct tgsi_full_src_register *tgsi_src,
928		     struct r600_shader_src *r600_src)
929{
930	memset(r600_src, 0, sizeof(*r600_src));
931	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
932	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
933	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
934	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
935	r600_src->neg = tgsi_src->Register.Negate;
936	r600_src->abs = tgsi_src->Register.Absolute;
937
938	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
939		int index;
940		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
941			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
942			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
943
944			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
945			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
946			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
947				return;
948		}
949		index = tgsi_src->Register.Index;
950		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
951		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
952	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
953		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
954			r600_src->swizzle[0] = 3;
955			r600_src->swizzle[1] = 3;
956			r600_src->swizzle[2] = 3;
957			r600_src->swizzle[3] = 3;
958			r600_src->sel = 0;
959		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
960			r600_src->swizzle[0] = 0;
961			r600_src->swizzle[1] = 0;
962			r600_src->swizzle[2] = 0;
963			r600_src->swizzle[3] = 0;
964			r600_src->sel = 0;
965		}
966	} else {
967		if (tgsi_src->Register.Indirect)
968			r600_src->rel = V_SQ_REL_RELATIVE;
969		r600_src->sel = tgsi_src->Register.Index;
970		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
971	}
972}
973
974static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int offset, unsigned int dst_reg)
975{
976	struct r600_bytecode_vtx vtx;
977	unsigned int ar_reg;
978	int r;
979
980	if (offset) {
981		struct r600_bytecode_alu alu;
982
983		memset(&alu, 0, sizeof(alu));
984
985		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
986		alu.src[0].sel = ctx->bc->ar_reg;
987
988		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
989		alu.src[1].value = offset;
990
991		alu.dst.sel = dst_reg;
992		alu.dst.write = 1;
993		alu.last = 1;
994
995		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
996			return r;
997
998		ar_reg = dst_reg;
999	} else {
1000		ar_reg = ctx->bc->ar_reg;
1001	}
1002
1003	memset(&vtx, 0, sizeof(vtx));
1004	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
1005	vtx.src_gpr = ar_reg;
1006	vtx.mega_fetch_count = 16;
1007	vtx.dst_gpr = dst_reg;
1008	vtx.dst_sel_x = 0;		/* SEL_X */
1009	vtx.dst_sel_y = 1;		/* SEL_Y */
1010	vtx.dst_sel_z = 2;		/* SEL_Z */
1011	vtx.dst_sel_w = 3;		/* SEL_W */
1012	vtx.data_format = FMT_32_32_32_32_FLOAT;
1013	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1014	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1015	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
1016	vtx.endian = r600_endian_swap(32);
1017
1018	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1019		return r;
1020
1021	return 0;
1022}
1023
1024static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1025{
1026	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1027	struct r600_bytecode_alu alu;
1028	int i, j, k, nconst, r;
1029
1030	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1031		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1032			nconst++;
1033		}
1034		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1035	}
1036	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1037		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1038			continue;
1039		}
1040
1041		if (ctx->src[i].rel) {
1042			int treg = r600_get_temp(ctx);
1043			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].sel - 512, treg)))
1044				return r;
1045
1046			ctx->src[i].sel = treg;
1047			ctx->src[i].rel = 0;
1048			j--;
1049		} else if (j > 0) {
1050			int treg = r600_get_temp(ctx);
1051			for (k = 0; k < 4; k++) {
1052				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1053				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1054				alu.src[0].sel = ctx->src[i].sel;
1055				alu.src[0].chan = k;
1056				alu.src[0].rel = ctx->src[i].rel;
1057				alu.dst.sel = treg;
1058				alu.dst.chan = k;
1059				alu.dst.write = 1;
1060				if (k == 3)
1061					alu.last = 1;
1062				r = r600_bytecode_add_alu(ctx->bc, &alu);
1063				if (r)
1064					return r;
1065			}
1066			ctx->src[i].sel = treg;
1067			ctx->src[i].rel =0;
1068			j--;
1069		}
1070	}
1071	return 0;
1072}
1073
1074/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1075static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1076{
1077	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1078	struct r600_bytecode_alu alu;
1079	int i, j, k, nliteral, r;
1080
1081	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1082		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1083			nliteral++;
1084		}
1085	}
1086	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1087		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1088			int treg = r600_get_temp(ctx);
1089			for (k = 0; k < 4; k++) {
1090				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1091				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1092				alu.src[0].sel = ctx->src[i].sel;
1093				alu.src[0].chan = k;
1094				alu.src[0].value = ctx->src[i].value[k];
1095				alu.dst.sel = treg;
1096				alu.dst.chan = k;
1097				alu.dst.write = 1;
1098				if (k == 3)
1099					alu.last = 1;
1100				r = r600_bytecode_add_alu(ctx->bc, &alu);
1101				if (r)
1102					return r;
1103			}
1104			ctx->src[i].sel = treg;
1105			j--;
1106		}
1107	}
1108	return 0;
1109}
1110
1111static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1112{
1113	int i, r, count = ctx->shader->ninput;
1114
1115	/* additional inputs will be allocated right after the existing inputs,
1116	 * we won't need them after the color selection, so we don't need to
1117	 * reserve these gprs for the rest of the shader code and to adjust
1118	 * output offsets etc. */
1119	int gpr = ctx->file_offset[TGSI_FILE_INPUT] +
1120			ctx->info.file_max[TGSI_FILE_INPUT] + 1;
1121
1122	if (ctx->face_gpr == -1) {
1123		i = ctx->shader->ninput++;
1124		ctx->shader->input[i].name = TGSI_SEMANTIC_FACE;
1125		ctx->shader->input[i].spi_sid = 0;
1126		ctx->shader->input[i].gpr = gpr++;
1127		ctx->face_gpr = ctx->shader->input[i].gpr;
1128	}
1129
1130	for (i = 0; i < count; i++) {
1131		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1132			int ni = ctx->shader->ninput++;
1133			memcpy(&ctx->shader->input[ni],&ctx->shader->input[i], sizeof(struct r600_shader_io));
1134			ctx->shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
1135			ctx->shader->input[ni].spi_sid = r600_spi_sid(&ctx->shader->input[ni]);
1136			ctx->shader->input[ni].gpr = gpr++;
1137
1138			if (ctx->bc->chip_class >= EVERGREEN) {
1139				r = evergreen_interp_input(ctx, ni);
1140				if (r)
1141					return r;
1142			}
1143
1144			r = select_twoside_color(ctx, i, ni);
1145			if (r)
1146				return r;
1147		}
1148	}
1149	return 0;
1150}
1151
1152static int r600_shader_from_tgsi(struct r600_context * rctx, struct r600_pipe_shader *pipeshader)
1153{
1154	struct r600_shader *shader = &pipeshader->shader;
1155	struct tgsi_token *tokens = pipeshader->selector->tokens;
1156	struct pipe_stream_output_info so = pipeshader->selector->so;
1157	struct tgsi_full_immediate *immediate;
1158	struct tgsi_full_property *property;
1159	struct r600_shader_ctx ctx;
1160	struct r600_bytecode_output output[32];
1161	unsigned output_done, noutput;
1162	unsigned opcode;
1163	int i, j, k, r = 0;
1164	int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0;
1165	/* Declarations used by llvm code */
1166	bool use_llvm = false;
1167	unsigned char * inst_bytes = NULL;
1168	unsigned inst_byte_count = 0;
1169
1170#ifdef R600_USE_LLVM
1171	use_llvm = debug_get_bool_option("R600_LLVM", TRUE);
1172#endif
1173	ctx.bc = &shader->bc;
1174	ctx.shader = shader;
1175	ctx.native_integers = true;
1176
1177	r600_bytecode_init(ctx.bc, rctx->chip_class, rctx->family);
1178	ctx.tokens = tokens;
1179	tgsi_scan_shader(tokens, &ctx.info);
1180	tgsi_parse_init(&ctx.parse, tokens);
1181	ctx.type = ctx.parse.FullHeader.Processor.Processor;
1182	shader->processor_type = ctx.type;
1183	ctx.bc->type = shader->processor_type;
1184
1185	ctx.face_gpr = -1;
1186	ctx.fragcoord_input = -1;
1187	ctx.colors_used = 0;
1188	ctx.clip_vertex_write = 0;
1189
1190	shader->nr_ps_color_exports = 0;
1191	shader->nr_ps_max_color_exports = 0;
1192
1193	shader->two_side = (ctx.type == TGSI_PROCESSOR_FRAGMENT) && rctx->two_side;
1194
1195	/* register allocations */
1196	/* Values [0,127] correspond to GPR[0..127].
1197	 * Values [128,159] correspond to constant buffer bank 0
1198	 * Values [160,191] correspond to constant buffer bank 1
1199	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1200	 * Values [256,287] correspond to constant buffer bank 2 (EG)
1201	 * Values [288,319] correspond to constant buffer bank 3 (EG)
1202	 * Other special values are shown in the list below.
1203	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1204	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1205	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1206	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1207	 * 248	SQ_ALU_SRC_0: special constant 0.0.
1208	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
1209	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
1210	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1211	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
1212	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
1213	 * 254	SQ_ALU_SRC_PV: previous vector result.
1214	 * 255	SQ_ALU_SRC_PS: previous scalar result.
1215	 */
1216	for (i = 0; i < TGSI_FILE_COUNT; i++) {
1217		ctx.file_offset[i] = 0;
1218	}
1219	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1220		ctx.file_offset[TGSI_FILE_INPUT] = 1;
1221		if (ctx.bc->chip_class >= EVERGREEN) {
1222			r600_bytecode_add_cfinst(ctx.bc, EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1223		} else {
1224			r600_bytecode_add_cfinst(ctx.bc, V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1225		}
1226	}
1227	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
1228		ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1229	}
1230
1231	/* LLVM backend setup */
1232#ifdef R600_USE_LLVM
1233	if (use_llvm && ctx.info.indirect_files) {
1234		fprintf(stderr, "Warning: R600 LLVM backend does not support "
1235				"indirect adressing.  Falling back to TGSI "
1236				"backend.\n");
1237		use_llvm = 0;
1238	}
1239	if (use_llvm) {
1240		struct radeon_llvm_context radeon_llvm_ctx;
1241		LLVMModuleRef mod;
1242		unsigned dump = 0;
1243		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
1244		radeon_llvm_ctx.reserved_reg_count = ctx.file_offset[TGSI_FILE_INPUT];
1245		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
1246		if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
1247			dump = 1;
1248		}
1249		if (r600_llvm_compile(mod, &inst_bytes, &inst_byte_count,
1250							rctx->family, dump)) {
1251			FREE(inst_bytes);
1252			radeon_llvm_dispose(&radeon_llvm_ctx);
1253			use_llvm = 0;
1254			fprintf(stderr, "R600 LLVM backend failed to compile "
1255				"shader.  Falling back to TGSI\n");
1256		} else {
1257			ctx.file_offset[TGSI_FILE_OUTPUT] =
1258					ctx.file_offset[TGSI_FILE_INPUT];
1259		}
1260		radeon_llvm_dispose(&radeon_llvm_ctx);
1261	}
1262#endif
1263	/* End of LLVM backend setup */
1264
1265	if (!use_llvm) {
1266		ctx.file_offset[TGSI_FILE_OUTPUT] =
1267			ctx.file_offset[TGSI_FILE_INPUT] +
1268			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1269	}
1270	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
1271						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
1272
1273	/* Outside the GPR range. This will be translated to one of the
1274	 * kcache banks later. */
1275	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
1276
1277	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
1278	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
1279			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
1280	ctx.temp_reg = ctx.bc->ar_reg + 1;
1281
1282	ctx.nliterals = 0;
1283	ctx.literals = NULL;
1284	shader->fs_write_all = FALSE;
1285	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1286		tgsi_parse_token(&ctx.parse);
1287		switch (ctx.parse.FullToken.Token.Type) {
1288		case TGSI_TOKEN_TYPE_IMMEDIATE:
1289			immediate = &ctx.parse.FullToken.FullImmediate;
1290			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
1291			if(ctx.literals == NULL) {
1292				r = -ENOMEM;
1293				goto out_err;
1294			}
1295			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
1296			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
1297			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
1298			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
1299			ctx.nliterals++;
1300			break;
1301		case TGSI_TOKEN_TYPE_DECLARATION:
1302			r = tgsi_declaration(&ctx);
1303			if (r)
1304				goto out_err;
1305			break;
1306		case TGSI_TOKEN_TYPE_INSTRUCTION:
1307			break;
1308		case TGSI_TOKEN_TYPE_PROPERTY:
1309			property = &ctx.parse.FullToken.FullProperty;
1310			switch (property->Property.PropertyName) {
1311			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
1312				if (property->u[0].Data == 1)
1313					shader->fs_write_all = TRUE;
1314				break;
1315			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
1316				if (property->u[0].Data == 1)
1317					shader->vs_prohibit_ucps = TRUE;
1318				break;
1319			}
1320			break;
1321		default:
1322			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
1323			r = -EINVAL;
1324			goto out_err;
1325		}
1326	}
1327
1328	if (shader->fs_write_all && rctx->chip_class >= EVERGREEN)
1329		shader->nr_ps_max_color_exports = 8;
1330
1331	if (ctx.fragcoord_input >= 0) {
1332		if (ctx.bc->chip_class == CAYMAN) {
1333			for (j = 0 ; j < 4; j++) {
1334				struct r600_bytecode_alu alu;
1335				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1336				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1337				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1338				alu.src[0].chan = 3;
1339
1340				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1341				alu.dst.chan = j;
1342				alu.dst.write = (j == 3);
1343				alu.last = 1;
1344				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1345					return r;
1346			}
1347		} else {
1348			struct r600_bytecode_alu alu;
1349			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1350			alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1351			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1352			alu.src[0].chan = 3;
1353
1354			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1355			alu.dst.chan = 3;
1356			alu.dst.write = 1;
1357			alu.last = 1;
1358			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1359				return r;
1360		}
1361	}
1362
1363	if (shader->two_side && ctx.colors_used) {
1364		if ((r = process_twoside_color_inputs(&ctx)))
1365			return r;
1366	}
1367
1368	tgsi_parse_init(&ctx.parse, tokens);
1369	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1370		tgsi_parse_token(&ctx.parse);
1371		switch (ctx.parse.FullToken.Token.Type) {
1372		case TGSI_TOKEN_TYPE_INSTRUCTION:
1373			if (use_llvm) {
1374				continue;
1375			}
1376			r = tgsi_is_supported(&ctx);
1377			if (r)
1378				goto out_err;
1379			ctx.max_driver_temp_used = 0;
1380			/* reserve first tmp for everyone */
1381			r600_get_temp(&ctx);
1382
1383			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
1384			if ((r = tgsi_split_constant(&ctx)))
1385				goto out_err;
1386			if ((r = tgsi_split_literal_constant(&ctx)))
1387				goto out_err;
1388			if (ctx.bc->chip_class == CAYMAN)
1389				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
1390			else if (ctx.bc->chip_class >= EVERGREEN)
1391				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
1392			else
1393				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
1394			r = ctx.inst_info->process(&ctx);
1395			if (r)
1396				goto out_err;
1397			break;
1398		default:
1399			break;
1400		}
1401	}
1402
1403	/* Get instructions if we are using the LLVM backend. */
1404	if (use_llvm) {
1405		r600_bytecode_from_byte_stream(&ctx, inst_bytes, inst_byte_count);
1406		FREE(inst_bytes);
1407	}
1408
1409	noutput = shader->noutput;
1410
1411	if (ctx.clip_vertex_write) {
1412		/* need to convert a clipvertex write into clipdistance writes and not export
1413		   the clip vertex anymore */
1414
1415		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
1416		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1417		shader->output[noutput].gpr = ctx.temp_reg;
1418		noutput++;
1419		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1420		shader->output[noutput].gpr = ctx.temp_reg+1;
1421		noutput++;
1422
1423		/* reset spi_sid for clipvertex output to avoid confusing spi */
1424		shader->output[ctx.cv_output].spi_sid = 0;
1425
1426		shader->clip_dist_write = 0xFF;
1427
1428		for (i = 0; i < 8; i++) {
1429			int oreg = i >> 2;
1430			int ochan = i & 3;
1431
1432			for (j = 0; j < 4; j++) {
1433				struct r600_bytecode_alu alu;
1434				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1435				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4);
1436				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
1437				alu.src[0].chan = j;
1438
1439				alu.src[1].sel = 512 + i;
1440				alu.src[1].kc_bank = 1;
1441				alu.src[1].chan = j;
1442
1443				alu.dst.sel = ctx.temp_reg + oreg;
1444				alu.dst.chan = j;
1445				alu.dst.write = (j == ochan);
1446				if (j == 3)
1447					alu.last = 1;
1448				r = r600_bytecode_add_alu(ctx.bc, &alu);
1449				if (r)
1450					return r;
1451			}
1452		}
1453	}
1454
1455	/* Add stream outputs. */
1456	if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
1457		for (i = 0; i < so.num_outputs; i++) {
1458			struct r600_bytecode_output output;
1459
1460			if (so.output[i].output_buffer >= 4) {
1461				R600_ERR("exceeded the max number of stream output buffers, got: %d\n",
1462					 so.output[i].output_buffer);
1463				r = -EINVAL;
1464				goto out_err;
1465			}
1466			if (so.output[i].dst_offset < so.output[i].start_component) {
1467			   R600_ERR("stream_output - dst_offset cannot be less than start_component\n");
1468			   r = -EINVAL;
1469			   goto out_err;
1470			}
1471
1472			memset(&output, 0, sizeof(struct r600_bytecode_output));
1473			output.gpr = shader->output[so.output[i].register_index].gpr;
1474			output.elem_size = 0;
1475			output.array_base = so.output[i].dst_offset - so.output[i].start_component;
1476			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1477			output.burst_count = 1;
1478			output.barrier = 1;
1479			/* array_size is an upper limit for the burst_count
1480			 * with MEM_STREAM instructions */
1481			output.array_size = 0xFFF;
1482			output.comp_mask = ((1 << so.output[i].num_components) - 1) << so.output[i].start_component;
1483			if (ctx.bc->chip_class >= EVERGREEN) {
1484				switch (so.output[i].output_buffer) {
1485				case 0:
1486					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0;
1487					break;
1488				case 1:
1489					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1;
1490					break;
1491				case 2:
1492					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2;
1493					break;
1494				case 3:
1495					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3;
1496					break;
1497				}
1498			} else {
1499				switch (so.output[i].output_buffer) {
1500				case 0:
1501					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0;
1502					break;
1503				case 1:
1504					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1;
1505					break;
1506				case 2:
1507					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2;
1508					break;
1509				case 3:
1510					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3;
1511					break;
1512				}
1513			}
1514			r = r600_bytecode_add_output(ctx.bc, &output);
1515			if (r)
1516				goto out_err;
1517		}
1518	}
1519
1520	/* export output */
1521	for (i = 0, j = 0; i < noutput; i++, j++) {
1522		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1523		output[j].gpr = shader->output[i].gpr;
1524		output[j].elem_size = 3;
1525		output[j].swizzle_x = 0;
1526		output[j].swizzle_y = 1;
1527		output[j].swizzle_z = 2;
1528		output[j].swizzle_w = 3;
1529		output[j].burst_count = 1;
1530		output[j].barrier = 1;
1531		output[j].type = -1;
1532		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1533		switch (ctx.type) {
1534		case TGSI_PROCESSOR_VERTEX:
1535			switch (shader->output[i].name) {
1536			case TGSI_SEMANTIC_POSITION:
1537				output[j].array_base = next_pos_base++;
1538				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1539				break;
1540
1541			case TGSI_SEMANTIC_PSIZE:
1542				output[j].array_base = next_pos_base++;
1543				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1544				break;
1545			case TGSI_SEMANTIC_CLIPVERTEX:
1546				j--;
1547				break;
1548			case TGSI_SEMANTIC_CLIPDIST:
1549				output[j].array_base = next_pos_base++;
1550				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1551				/* spi_sid is 0 for clipdistance outputs that were generated
1552				 * for clipvertex - we don't need to pass them to PS */
1553				if (shader->output[i].spi_sid) {
1554					j++;
1555					/* duplicate it as PARAM to pass to the pixel shader */
1556					memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
1557					output[j].array_base = next_param_base++;
1558					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1559				}
1560				break;
1561			case TGSI_SEMANTIC_FOG:
1562				output[j].swizzle_y = 4; /* 0 */
1563				output[j].swizzle_z = 4; /* 0 */
1564				output[j].swizzle_w = 5; /* 1 */
1565				break;
1566			}
1567			break;
1568		case TGSI_PROCESSOR_FRAGMENT:
1569			if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
1570				/* never export more colors than the number of CBs */
1571				if (next_pixel_base && next_pixel_base >= (rctx->nr_cbufs + rctx->dual_src_blend * 1)) {
1572					/* skip export */
1573					j--;
1574					continue;
1575				}
1576				output[j].swizzle_w = rctx->alpha_to_one && rctx->multisample_enable && !rctx->cb0_is_integer ? 5 : 3;
1577				output[j].array_base = next_pixel_base++;
1578				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1579				shader->nr_ps_color_exports++;
1580				if (shader->fs_write_all && (rctx->chip_class >= EVERGREEN)) {
1581					for (k = 1; k < rctx->nr_cbufs; k++) {
1582						j++;
1583						memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1584						output[j].gpr = shader->output[i].gpr;
1585						output[j].elem_size = 3;
1586						output[j].swizzle_x = 0;
1587						output[j].swizzle_y = 1;
1588						output[j].swizzle_z = 2;
1589						output[j].swizzle_w = rctx->alpha_to_one && rctx->multisample_enable && !rctx->cb0_is_integer ? 5 : 3;
1590						output[j].burst_count = 1;
1591						output[j].barrier = 1;
1592						output[j].array_base = next_pixel_base++;
1593						output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1594						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1595						shader->nr_ps_color_exports++;
1596					}
1597				}
1598			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
1599				output[j].array_base = 61;
1600				output[j].swizzle_x = 2;
1601				output[j].swizzle_y = 7;
1602				output[j].swizzle_z = output[j].swizzle_w = 7;
1603				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1604			} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
1605				output[j].array_base = 61;
1606				output[j].swizzle_x = 7;
1607				output[j].swizzle_y = 1;
1608				output[j].swizzle_z = output[j].swizzle_w = 7;
1609				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1610			} else {
1611				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
1612				r = -EINVAL;
1613				goto out_err;
1614			}
1615			break;
1616		default:
1617			R600_ERR("unsupported processor type %d\n", ctx.type);
1618			r = -EINVAL;
1619			goto out_err;
1620		}
1621
1622		if (output[j].type==-1) {
1623			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1624			output[j].array_base = next_param_base++;
1625		}
1626	}
1627
1628	/* add fake param output for vertex shader if no param is exported */
1629	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
1630			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1631			output[j].gpr = 0;
1632			output[j].elem_size = 3;
1633			output[j].swizzle_x = 7;
1634			output[j].swizzle_y = 7;
1635			output[j].swizzle_z = 7;
1636			output[j].swizzle_w = 7;
1637			output[j].burst_count = 1;
1638			output[j].barrier = 1;
1639			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1640			output[j].array_base = 0;
1641			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1642			j++;
1643	}
1644
1645	/* add fake pixel export */
1646	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && next_pixel_base == 0) {
1647		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1648		output[j].gpr = 0;
1649		output[j].elem_size = 3;
1650		output[j].swizzle_x = 7;
1651		output[j].swizzle_y = 7;
1652		output[j].swizzle_z = 7;
1653		output[j].swizzle_w = 7;
1654		output[j].burst_count = 1;
1655		output[j].barrier = 1;
1656		output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1657		output[j].array_base = 0;
1658		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1659		j++;
1660	}
1661
1662	noutput = j;
1663
1664	/* set export done on last export of each type */
1665	for (i = noutput - 1, output_done = 0; i >= 0; i--) {
1666		if (ctx.bc->chip_class < CAYMAN) {
1667			if (i == (noutput - 1)) {
1668				output[i].end_of_program = 1;
1669			}
1670		}
1671		if (!(output_done & (1 << output[i].type))) {
1672			output_done |= (1 << output[i].type);
1673			output[i].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE);
1674		}
1675	}
1676	/* add output to bytecode */
1677	for (i = 0; i < noutput; i++) {
1678		r = r600_bytecode_add_output(ctx.bc, &output[i]);
1679		if (r)
1680			goto out_err;
1681	}
1682	/* add program end */
1683	if (ctx.bc->chip_class == CAYMAN)
1684		cm_bytecode_add_cf_end(ctx.bc);
1685
1686	/* check GPR limit - we have 124 = 128 - 4
1687	 * (4 are reserved as alu clause temporary registers) */
1688	if (ctx.bc->ngpr > 124) {
1689		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
1690		r = -ENOMEM;
1691		goto out_err;
1692	}
1693
1694	free(ctx.literals);
1695	tgsi_parse_free(&ctx.parse);
1696	return 0;
1697out_err:
1698	free(ctx.literals);
1699	tgsi_parse_free(&ctx.parse);
1700	return r;
1701}
1702
1703static int tgsi_unsupported(struct r600_shader_ctx *ctx)
1704{
1705	R600_ERR("%s tgsi opcode unsupported\n",
1706		 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
1707	return -EINVAL;
1708}
1709
/* Handler that emits nothing and always reports success. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	(void)ctx;	/* unused */

	return 0;
}
1714
1715static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
1716			const struct r600_shader_src *shader_src,
1717			unsigned chan)
1718{
1719	bc_src->sel = shader_src->sel;
1720	bc_src->chan = shader_src->swizzle[chan];
1721	bc_src->neg = shader_src->neg;
1722	bc_src->abs = shader_src->abs;
1723	bc_src->rel = shader_src->rel;
1724	bc_src->value = shader_src->value[bc_src->chan];
1725}
1726
1727static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
1728{
1729	bc_src->abs = 1;
1730	bc_src->neg = 0;
1731}
1732
1733static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
1734{
1735	bc_src->neg = !bc_src->neg;
1736}
1737
1738static void tgsi_dst(struct r600_shader_ctx *ctx,
1739		     const struct tgsi_full_dst_register *tgsi_dst,
1740		     unsigned swizzle,
1741		     struct r600_bytecode_alu_dst *r600_dst)
1742{
1743	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1744
1745	r600_dst->sel = tgsi_dst->Register.Index;
1746	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
1747	r600_dst->chan = swizzle;
1748	r600_dst->write = 1;
1749	if (tgsi_dst->Register.Indirect)
1750		r600_dst->rel = V_SQ_REL_RELATIVE;
1751	if (inst->Instruction.Saturate) {
1752		r600_dst->clamp = 1;
1753	}
1754}
1755
/*
 * Return the index (0..3) of the highest channel enabled in the low four
 * bits of writemask, or 0 when none of them is set.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* scan from W down to Y; X (or "nothing set") is the fallback */
	for (chan = 3; chan > 0; chan--) {
		if ((writemask >> chan) & 1u)
			return chan;
	}
	return 0;
}
1767
1768static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
1769{
1770	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1771	struct r600_bytecode_alu alu;
1772	int i, j, r;
1773	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1774
1775	for (i = 0; i < lasti + 1; i++) {
1776		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1777			continue;
1778
1779		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1780		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1781
1782		alu.inst = ctx->inst_info->r600_opcode;
1783		if (!swap) {
1784			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1785				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
1786			}
1787		} else {
1788			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
1789			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1790		}
1791		/* handle some special cases */
1792		switch (ctx->inst_info->tgsi_opcode) {
1793		case TGSI_OPCODE_SUB:
1794			r600_bytecode_src_toggle_neg(&alu.src[1]);
1795			break;
1796		case TGSI_OPCODE_ABS:
1797			r600_bytecode_src_set_abs(&alu.src[0]);
1798			break;
1799		default:
1800			break;
1801		}
1802		if (i == lasti || trans_only) {
1803			alu.last = 1;
1804		}
1805		r = r600_bytecode_add_alu(ctx->bc, &alu);
1806		if (r)
1807			return r;
1808	}
1809	return 0;
1810}
1811
/* Per-channel emission with sources in TGSI order and a shared group. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1816
/* Per-channel emission with TGSI sources 0 and 1 exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	const int swap = 1;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1821
/* Per-channel emission where every ALU op is flagged as "last". */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 1;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1826
1827static int tgsi_ineg(struct r600_shader_ctx *ctx)
1828{
1829	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1830	struct r600_bytecode_alu alu;
1831	int i, r;
1832	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1833
1834	for (i = 0; i < lasti + 1; i++) {
1835
1836		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1837			continue;
1838		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1839		alu.inst = ctx->inst_info->r600_opcode;
1840
1841		alu.src[0].sel = V_SQ_ALU_SRC_0;
1842
1843		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1844
1845		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1846
1847		if (i == lasti) {
1848			alu.last = 1;
1849		}
1850		r = r600_bytecode_add_alu(ctx->bc, &alu);
1851		if (r)
1852			return r;
1853	}
1854	return 0;
1855
1856}
1857
1858static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
1859{
1860	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1861	int i, j, r;
1862	struct r600_bytecode_alu alu;
1863	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1864
1865	for (i = 0 ; i < last_slot; i++) {
1866		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1867		alu.inst = ctx->inst_info->r600_opcode;
1868		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1869			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
1870		}
1871		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1872		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
1873
1874		if (i == last_slot - 1)
1875			alu.last = 1;
1876		r = r600_bytecode_add_alu(ctx->bc, &alu);
1877		if (r)
1878			return r;
1879	}
1880	return 0;
1881}
1882
1883static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
1884{
1885	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1886	int i, j, k, r;
1887	struct r600_bytecode_alu alu;
1888	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1889	for (k = 0; k < last_slot; k++) {
1890		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
1891			continue;
1892
1893		for (i = 0 ; i < 4; i++) {
1894			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1895			alu.inst = ctx->inst_info->r600_opcode;
1896			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1897				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
1898			}
1899			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1900			alu.dst.write = (i == k);
1901			if (i == 3)
1902				alu.last = 1;
1903			r = r600_bytecode_add_alu(ctx->bc, &alu);
1904			if (r)
1905				return r;
1906		}
1907	}
1908	return 0;
1909}
1910
1911/*
1912 * r600 - trunc to -PI..PI range
1913 * r700 - normalize by dividing by 2PI
1914 * see fdo bug 27901
1915 */
1916static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
1917{
1918	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
1919	static float double_pi = 3.1415926535 * 2;
1920	static float neg_pi = -3.1415926535;
1921
1922	int r;
1923	struct r600_bytecode_alu alu;
1924
1925	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1926	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
1927	alu.is_op3 = 1;
1928
1929	alu.dst.chan = 0;
1930	alu.dst.sel = ctx->temp_reg;
1931	alu.dst.write = 1;
1932
1933	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
1934
1935	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1936	alu.src[1].chan = 0;
1937	alu.src[1].value = *(uint32_t *)&half_inv_pi;
1938	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
1939	alu.src[2].chan = 0;
1940	alu.last = 1;
1941	r = r600_bytecode_add_alu(ctx->bc, &alu);
1942	if (r)
1943		return r;
1944
1945	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1946	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
1947
1948	alu.dst.chan = 0;
1949	alu.dst.sel = ctx->temp_reg;
1950	alu.dst.write = 1;
1951
1952	alu.src[0].sel = ctx->temp_reg;
1953	alu.src[0].chan = 0;
1954	alu.last = 1;
1955	r = r600_bytecode_add_alu(ctx->bc, &alu);
1956	if (r)
1957		return r;
1958
1959	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1960	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
1961	alu.is_op3 = 1;
1962
1963	alu.dst.chan = 0;
1964	alu.dst.sel = ctx->temp_reg;
1965	alu.dst.write = 1;
1966
1967	alu.src[0].sel = ctx->temp_reg;
1968	alu.src[0].chan = 0;
1969
1970	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1971	alu.src[1].chan = 0;
1972	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
1973	alu.src[2].chan = 0;
1974
1975	if (ctx->bc->chip_class == R600) {
1976		alu.src[1].value = *(uint32_t *)&double_pi;
1977		alu.src[2].value = *(uint32_t *)&neg_pi;
1978	} else {
1979		alu.src[1].sel = V_SQ_ALU_SRC_1;
1980		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
1981		alu.src[2].neg = 1;
1982	}
1983
1984	alu.last = 1;
1985	r = r600_bytecode_add_alu(ctx->bc, &alu);
1986	if (r)
1987		return r;
1988	return 0;
1989}
1990
1991static int cayman_trig(struct r600_shader_ctx *ctx)
1992{
1993	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1994	struct r600_bytecode_alu alu;
1995	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1996	int i, r;
1997
1998	r = tgsi_setup_trig(ctx);
1999	if (r)
2000		return r;
2001
2002
2003	for (i = 0; i < last_slot; i++) {
2004		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2005		alu.inst = ctx->inst_info->r600_opcode;
2006		alu.dst.chan = i;
2007
2008		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2009		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2010
2011		alu.src[0].sel = ctx->temp_reg;
2012		alu.src[0].chan = 0;
2013		if (i == last_slot - 1)
2014			alu.last = 1;
2015		r = r600_bytecode_add_alu(ctx->bc, &alu);
2016		if (r)
2017			return r;
2018	}
2019	return 0;
2020}
2021
2022static int tgsi_trig(struct r600_shader_ctx *ctx)
2023{
2024	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2025	struct r600_bytecode_alu alu;
2026	int i, r;
2027	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2028
2029	r = tgsi_setup_trig(ctx);
2030	if (r)
2031		return r;
2032
2033	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2034	alu.inst = ctx->inst_info->r600_opcode;
2035	alu.dst.chan = 0;
2036	alu.dst.sel = ctx->temp_reg;
2037	alu.dst.write = 1;
2038
2039	alu.src[0].sel = ctx->temp_reg;
2040	alu.src[0].chan = 0;
2041	alu.last = 1;
2042	r = r600_bytecode_add_alu(ctx->bc, &alu);
2043	if (r)
2044		return r;
2045
2046	/* replicate result */
2047	for (i = 0; i < lasti + 1; i++) {
2048		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2049			continue;
2050
2051		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2052		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2053
2054		alu.src[0].sel = ctx->temp_reg;
2055		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2056		if (i == lasti)
2057			alu.last = 1;
2058		r = r600_bytecode_add_alu(ctx->bc, &alu);
2059		if (r)
2060			return r;
2061	}
2062	return 0;
2063}
2064
2065static int tgsi_scs(struct r600_shader_ctx *ctx)
2066{
2067	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2068	struct r600_bytecode_alu alu;
2069	int i, r;
2070
2071	/* We'll only need the trig stuff if we are going to write to the
2072	 * X or Y components of the destination vector.
2073	 */
2074	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
2075		r = tgsi_setup_trig(ctx);
2076		if (r)
2077			return r;
2078	}
2079
2080	/* dst.x = COS */
2081	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2082		if (ctx->bc->chip_class == CAYMAN) {
2083			for (i = 0 ; i < 3; i++) {
2084				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2085				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2086				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2087
2088				if (i == 0)
2089					alu.dst.write = 1;
2090				else
2091					alu.dst.write = 0;
2092				alu.src[0].sel = ctx->temp_reg;
2093				alu.src[0].chan = 0;
2094				if (i == 2)
2095					alu.last = 1;
2096				r = r600_bytecode_add_alu(ctx->bc, &alu);
2097				if (r)
2098					return r;
2099			}
2100		} else {
2101			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2102			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2103			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2104
2105			alu.src[0].sel = ctx->temp_reg;
2106			alu.src[0].chan = 0;
2107			alu.last = 1;
2108			r = r600_bytecode_add_alu(ctx->bc, &alu);
2109			if (r)
2110				return r;
2111		}
2112	}
2113
2114	/* dst.y = SIN */
2115	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2116		if (ctx->bc->chip_class == CAYMAN) {
2117			for (i = 0 ; i < 3; i++) {
2118				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2119				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2120				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2121				if (i == 1)
2122					alu.dst.write = 1;
2123				else
2124					alu.dst.write = 0;
2125				alu.src[0].sel = ctx->temp_reg;
2126				alu.src[0].chan = 0;
2127				if (i == 2)
2128					alu.last = 1;
2129				r = r600_bytecode_add_alu(ctx->bc, &alu);
2130				if (r)
2131					return r;
2132			}
2133		} else {
2134			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2135			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2136			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2137
2138			alu.src[0].sel = ctx->temp_reg;
2139			alu.src[0].chan = 0;
2140			alu.last = 1;
2141			r = r600_bytecode_add_alu(ctx->bc, &alu);
2142			if (r)
2143				return r;
2144		}
2145	}
2146
2147	/* dst.z = 0.0; */
2148	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2149		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2150
2151		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2152
2153		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2154
2155		alu.src[0].sel = V_SQ_ALU_SRC_0;
2156		alu.src[0].chan = 0;
2157
2158		alu.last = 1;
2159
2160		r = r600_bytecode_add_alu(ctx->bc, &alu);
2161		if (r)
2162			return r;
2163	}
2164
2165	/* dst.w = 1.0; */
2166	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2167		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2168
2169		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2170
2171		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2172
2173		alu.src[0].sel = V_SQ_ALU_SRC_1;
2174		alu.src[0].chan = 0;
2175
2176		alu.last = 1;
2177
2178		r = r600_bytecode_add_alu(ctx->bc, &alu);
2179		if (r)
2180			return r;
2181	}
2182
2183	return 0;
2184}
2185
2186static int tgsi_kill(struct r600_shader_ctx *ctx)
2187{
2188	struct r600_bytecode_alu alu;
2189	int i, r;
2190
2191	for (i = 0; i < 4; i++) {
2192		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2193		alu.inst = ctx->inst_info->r600_opcode;
2194
2195		alu.dst.chan = i;
2196
2197		alu.src[0].sel = V_SQ_ALU_SRC_0;
2198
2199		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) {
2200			alu.src[1].sel = V_SQ_ALU_SRC_1;
2201			alu.src[1].neg = 1;
2202		} else {
2203			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2204		}
2205		if (i == 3) {
2206			alu.last = 1;
2207		}
2208		r = r600_bytecode_add_alu(ctx->bc, &alu);
2209		if (r)
2210			return r;
2211	}
2212
2213	/* kill must be last in ALU */
2214	ctx->bc->force_add_cf = 1;
2215	ctx->shader->uses_kill = TRUE;
2216	return 0;
2217}
2218
2219static int tgsi_lit(struct r600_shader_ctx *ctx)
2220{
2221	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2222	struct r600_bytecode_alu alu;
2223	int r;
2224
2225	/* tmp.x = max(src.y, 0.0) */
2226	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2227	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2228	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
2229	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2230	alu.src[1].chan = 1;
2231
2232	alu.dst.sel = ctx->temp_reg;
2233	alu.dst.chan = 0;
2234	alu.dst.write = 1;
2235
2236	alu.last = 1;
2237	r = r600_bytecode_add_alu(ctx->bc, &alu);
2238	if (r)
2239		return r;
2240
2241	if (inst->Dst[0].Register.WriteMask & (1 << 2))
2242	{
2243		int chan;
2244		int sel;
2245		int i;
2246
2247		if (ctx->bc->chip_class == CAYMAN) {
2248			for (i = 0; i < 3; i++) {
2249				/* tmp.z = log(tmp.x) */
2250				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2251				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2252				alu.src[0].sel = ctx->temp_reg;
2253				alu.src[0].chan = 0;
2254				alu.dst.sel = ctx->temp_reg;
2255				alu.dst.chan = i;
2256				if (i == 2) {
2257					alu.dst.write = 1;
2258					alu.last = 1;
2259				} else
2260					alu.dst.write = 0;
2261
2262				r = r600_bytecode_add_alu(ctx->bc, &alu);
2263				if (r)
2264					return r;
2265			}
2266		} else {
2267			/* tmp.z = log(tmp.x) */
2268			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2269			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2270			alu.src[0].sel = ctx->temp_reg;
2271			alu.src[0].chan = 0;
2272			alu.dst.sel = ctx->temp_reg;
2273			alu.dst.chan = 2;
2274			alu.dst.write = 1;
2275			alu.last = 1;
2276			r = r600_bytecode_add_alu(ctx->bc, &alu);
2277			if (r)
2278				return r;
2279		}
2280
2281		chan = alu.dst.chan;
2282		sel = alu.dst.sel;
2283
2284		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
2285		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2286		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT);
2287		alu.src[0].sel  = sel;
2288		alu.src[0].chan = chan;
2289		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
2290		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
2291		alu.dst.sel = ctx->temp_reg;
2292		alu.dst.chan = 0;
2293		alu.dst.write = 1;
2294		alu.is_op3 = 1;
2295		alu.last = 1;
2296		r = r600_bytecode_add_alu(ctx->bc, &alu);
2297		if (r)
2298			return r;
2299
2300		if (ctx->bc->chip_class == CAYMAN) {
2301			for (i = 0; i < 3; i++) {
2302				/* dst.z = exp(tmp.x) */
2303				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2304				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2305				alu.src[0].sel = ctx->temp_reg;
2306				alu.src[0].chan = 0;
2307				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2308				if (i == 2) {
2309					alu.dst.write = 1;
2310					alu.last = 1;
2311				} else
2312					alu.dst.write = 0;
2313				r = r600_bytecode_add_alu(ctx->bc, &alu);
2314				if (r)
2315					return r;
2316			}
2317		} else {
2318			/* dst.z = exp(tmp.x) */
2319			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2320			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2321			alu.src[0].sel = ctx->temp_reg;
2322			alu.src[0].chan = 0;
2323			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2324			alu.last = 1;
2325			r = r600_bytecode_add_alu(ctx->bc, &alu);
2326			if (r)
2327				return r;
2328		}
2329	}
2330
2331	/* dst.x, <- 1.0  */
2332	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2333	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2334	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
2335	alu.src[0].chan = 0;
2336	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2337	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
2338	r = r600_bytecode_add_alu(ctx->bc, &alu);
2339	if (r)
2340		return r;
2341
2342	/* dst.y = max(src.x, 0.0) */
2343	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2344	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2345	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2346	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2347	alu.src[1].chan = 0;
2348	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2349	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
2350	r = r600_bytecode_add_alu(ctx->bc, &alu);
2351	if (r)
2352		return r;
2353
2354	/* dst.w, <- 1.0  */
2355	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2356	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2357	alu.src[0].sel  = V_SQ_ALU_SRC_1;
2358	alu.src[0].chan = 0;
2359	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2360	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
2361	alu.last = 1;
2362	r = r600_bytecode_add_alu(ctx->bc, &alu);
2363	if (r)
2364		return r;
2365
2366	return 0;
2367}
2368
2369static int tgsi_rsq(struct r600_shader_ctx *ctx)
2370{
2371	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2372	struct r600_bytecode_alu alu;
2373	int i, r;
2374
2375	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2376
2377	/* XXX:
2378	 * For state trackers other than OpenGL, we'll want to use
2379	 * _RECIPSQRT_IEEE instead.
2380	 */
2381	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED);
2382
2383	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2384		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2385		r600_bytecode_src_set_abs(&alu.src[i]);
2386	}
2387	alu.dst.sel = ctx->temp_reg;
2388	alu.dst.write = 1;
2389	alu.last = 1;
2390	r = r600_bytecode_add_alu(ctx->bc, &alu);
2391	if (r)
2392		return r;
2393	/* replicate result */
2394	return tgsi_helper_tempx_replicate(ctx);
2395}
2396
2397static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
2398{
2399	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2400	struct r600_bytecode_alu alu;
2401	int i, r;
2402
2403	for (i = 0; i < 4; i++) {
2404		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2405		alu.src[0].sel = ctx->temp_reg;
2406		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2407		alu.dst.chan = i;
2408		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2409		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2410		if (i == 3)
2411			alu.last = 1;
2412		r = r600_bytecode_add_alu(ctx->bc, &alu);
2413		if (r)
2414			return r;
2415	}
2416	return 0;
2417}
2418
2419static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
2420{
2421	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2422	struct r600_bytecode_alu alu;
2423	int i, r;
2424
2425	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2426	alu.inst = ctx->inst_info->r600_opcode;
2427	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2428		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2429	}
2430	alu.dst.sel = ctx->temp_reg;
2431	alu.dst.write = 1;
2432	alu.last = 1;
2433	r = r600_bytecode_add_alu(ctx->bc, &alu);
2434	if (r)
2435		return r;
2436	/* replicate result */
2437	return tgsi_helper_tempx_replicate(ctx);
2438}
2439
2440static int cayman_pow(struct r600_shader_ctx *ctx)
2441{
2442	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2443	int i, r;
2444	struct r600_bytecode_alu alu;
2445	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2446
2447	for (i = 0; i < 3; i++) {
2448		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2449		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2450		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2451		alu.dst.sel = ctx->temp_reg;
2452		alu.dst.chan = i;
2453		alu.dst.write = 1;
2454		if (i == 2)
2455			alu.last = 1;
2456		r = r600_bytecode_add_alu(ctx->bc, &alu);
2457		if (r)
2458			return r;
2459	}
2460
2461	/* b * LOG2(a) */
2462	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2463	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2464	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2465	alu.src[1].sel = ctx->temp_reg;
2466	alu.dst.sel = ctx->temp_reg;
2467	alu.dst.write = 1;
2468	alu.last = 1;
2469	r = r600_bytecode_add_alu(ctx->bc, &alu);
2470	if (r)
2471		return r;
2472
2473	for (i = 0; i < last_slot; i++) {
2474		/* POW(a,b) = EXP2(b * LOG2(a))*/
2475		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2476		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2477		alu.src[0].sel = ctx->temp_reg;
2478
2479		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2480		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2481		if (i == last_slot - 1)
2482			alu.last = 1;
2483		r = r600_bytecode_add_alu(ctx->bc, &alu);
2484		if (r)
2485			return r;
2486	}
2487	return 0;
2488}
2489
2490static int tgsi_pow(struct r600_shader_ctx *ctx)
2491{
2492	struct r600_bytecode_alu alu;
2493	int r;
2494
2495	/* LOG2(a) */
2496	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2497	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2498	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2499	alu.dst.sel = ctx->temp_reg;
2500	alu.dst.write = 1;
2501	alu.last = 1;
2502	r = r600_bytecode_add_alu(ctx->bc, &alu);
2503	if (r)
2504		return r;
2505	/* b * LOG2(a) */
2506	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2507	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2508	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2509	alu.src[1].sel = ctx->temp_reg;
2510	alu.dst.sel = ctx->temp_reg;
2511	alu.dst.write = 1;
2512	alu.last = 1;
2513	r = r600_bytecode_add_alu(ctx->bc, &alu);
2514	if (r)
2515		return r;
2516	/* POW(a,b) = EXP2(b * LOG2(a))*/
2517	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2518	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2519	alu.src[0].sel = ctx->temp_reg;
2520	alu.dst.sel = ctx->temp_reg;
2521	alu.dst.write = 1;
2522	alu.last = 1;
2523	r = r600_bytecode_add_alu(ctx->bc, &alu);
2524	if (r)
2525		return r;
2526	return tgsi_helper_tempx_replicate(ctx);
2527}
2528
2529static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
2530{
2531	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2532	struct r600_bytecode_alu alu;
2533	int i, r, j;
2534	unsigned write_mask = inst->Dst[0].Register.WriteMask;
2535	int tmp0 = ctx->temp_reg;
2536	int tmp1 = r600_get_temp(ctx);
2537	int tmp2 = r600_get_temp(ctx);
2538	int tmp3 = r600_get_temp(ctx);
2539	/* Unsigned path:
2540	 *
2541	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
2542	 *
2543	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
2544	 * 2. tmp0.z = lo (tmp0.x * src2)
2545	 * 3. tmp0.w = -tmp0.z
2546	 * 4. tmp0.y = hi (tmp0.x * src2)
2547	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
2548	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
2549	 * 7. tmp1.x = tmp0.x - tmp0.w
2550	 * 8. tmp1.y = tmp0.x + tmp0.w
2551	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
2552	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
2553	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
2554	 *
2555	 * 12. tmp0.w = src1 - tmp0.y       = r
2556	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
2557	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
2558	 *
2559	 * if DIV
2560	 *
2561	 *   15. tmp1.z = tmp0.z + 1			= q + 1
2562	 *   16. tmp1.w = tmp0.z - 1			= q - 1
2563	 *
2564	 * else MOD
2565	 *
2566	 *   15. tmp1.z = tmp0.w - src2			= r - src2
2567	 *   16. tmp1.w = tmp0.w + src2			= r + src2
2568	 *
2569	 * endif
2570	 *
2571	 * 17. tmp1.x = tmp1.x & tmp1.y
2572	 *
2573	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
2574	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
2575	 *
2576	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
2577	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
2578	 *
2579	 * Signed path:
2580	 *
2581	 * Same as unsigned, using abs values of the operands,
2582	 * and fixing the sign of the result in the end.
2583	 */
2584
2585	for (i = 0; i < 4; i++) {
2586		if (!(write_mask & (1<<i)))
2587			continue;
2588
2589		if (signed_op) {
2590
2591			/* tmp2.x = -src0 */
2592			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2593			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2594
2595			alu.dst.sel = tmp2;
2596			alu.dst.chan = 0;
2597			alu.dst.write = 1;
2598
2599			alu.src[0].sel = V_SQ_ALU_SRC_0;
2600
2601			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2602
2603			alu.last = 1;
2604			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2605				return r;
2606
2607			/* tmp2.y = -src1 */
2608			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2609			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2610
2611			alu.dst.sel = tmp2;
2612			alu.dst.chan = 1;
2613			alu.dst.write = 1;
2614
2615			alu.src[0].sel = V_SQ_ALU_SRC_0;
2616
2617			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2618
2619			alu.last = 1;
2620			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2621				return r;
2622
2623			/* tmp2.z sign bit is set if src0 and src2 signs are different */
2624			/* it will be a sign of the quotient */
2625			if (!mod) {
2626
2627				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2628				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT);
2629
2630				alu.dst.sel = tmp2;
2631				alu.dst.chan = 2;
2632				alu.dst.write = 1;
2633
2634				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2635				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2636
2637				alu.last = 1;
2638				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2639					return r;
2640			}
2641
2642			/* tmp2.x = |src0| */
2643			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2644			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2645			alu.is_op3 = 1;
2646
2647			alu.dst.sel = tmp2;
2648			alu.dst.chan = 0;
2649			alu.dst.write = 1;
2650
2651			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2652			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2653			alu.src[2].sel = tmp2;
2654			alu.src[2].chan = 0;
2655
2656			alu.last = 1;
2657			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2658				return r;
2659
2660			/* tmp2.y = |src1| */
2661			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2662			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2663			alu.is_op3 = 1;
2664
2665			alu.dst.sel = tmp2;
2666			alu.dst.chan = 1;
2667			alu.dst.write = 1;
2668
2669			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2670			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2671			alu.src[2].sel = tmp2;
2672			alu.src[2].chan = 1;
2673
2674			alu.last = 1;
2675			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2676				return r;
2677
2678		}
2679
2680		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
2681		if (ctx->bc->chip_class == CAYMAN) {
2682			/* tmp3.x = u2f(src2) */
2683			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2684			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
2685
2686			alu.dst.sel = tmp3;
2687			alu.dst.chan = 0;
2688			alu.dst.write = 1;
2689
2690			if (signed_op) {
2691				alu.src[0].sel = tmp2;
2692				alu.src[0].chan = 1;
2693			} else {
2694				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2695			}
2696
2697			alu.last = 1;
2698			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2699				return r;
2700
2701			/* tmp0.x = recip(tmp3.x) */
2702			for (j = 0 ; j < 3; j++) {
2703				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2704				alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE;
2705
2706				alu.dst.sel = tmp0;
2707				alu.dst.chan = j;
2708				alu.dst.write = (j == 0);
2709
2710				alu.src[0].sel = tmp3;
2711				alu.src[0].chan = 0;
2712
2713				if (j == 2)
2714					alu.last = 1;
2715				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2716					return r;
2717			}
2718
2719			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2720			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2721
2722			alu.src[0].sel = tmp0;
2723			alu.src[0].chan = 0;
2724
2725			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2726			alu.src[1].value = 0x4f800000;
2727
2728			alu.dst.sel = tmp3;
2729			alu.dst.write = 1;
2730			alu.last = 1;
2731			r = r600_bytecode_add_alu(ctx->bc, &alu);
2732			if (r)
2733				return r;
2734
2735			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2736			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
2737
2738			alu.dst.sel = tmp0;
2739			alu.dst.chan = 0;
2740			alu.dst.write = 1;
2741
2742			alu.src[0].sel = tmp3;
2743			alu.src[0].chan = 0;
2744
2745			alu.last = 1;
2746			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2747				return r;
2748
2749		} else {
2750			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2751			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT);
2752
2753			alu.dst.sel = tmp0;
2754			alu.dst.chan = 0;
2755			alu.dst.write = 1;
2756
2757			if (signed_op) {
2758				alu.src[0].sel = tmp2;
2759				alu.src[0].chan = 1;
2760			} else {
2761				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2762			}
2763
2764			alu.last = 1;
2765			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2766				return r;
2767		}
2768
2769		/* 2. tmp0.z = lo (tmp0.x * src2) */
2770		if (ctx->bc->chip_class == CAYMAN) {
2771			for (j = 0 ; j < 4; j++) {
2772				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2773				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2774
2775				alu.dst.sel = tmp0;
2776				alu.dst.chan = j;
2777				alu.dst.write = (j == 2);
2778
2779				alu.src[0].sel = tmp0;
2780				alu.src[0].chan = 0;
2781				if (signed_op) {
2782					alu.src[1].sel = tmp2;
2783					alu.src[1].chan = 1;
2784				} else {
2785					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2786				}
2787
2788				alu.last = (j == 3);
2789				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2790					return r;
2791			}
2792		} else {
2793			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2794			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2795
2796			alu.dst.sel = tmp0;
2797			alu.dst.chan = 2;
2798			alu.dst.write = 1;
2799
2800			alu.src[0].sel = tmp0;
2801			alu.src[0].chan = 0;
2802			if (signed_op) {
2803				alu.src[1].sel = tmp2;
2804				alu.src[1].chan = 1;
2805			} else {
2806				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2807			}
2808
2809			alu.last = 1;
2810			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2811				return r;
2812		}
2813
2814		/* 3. tmp0.w = -tmp0.z */
2815		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2816		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2817
2818		alu.dst.sel = tmp0;
2819		alu.dst.chan = 3;
2820		alu.dst.write = 1;
2821
2822		alu.src[0].sel = V_SQ_ALU_SRC_0;
2823		alu.src[1].sel = tmp0;
2824		alu.src[1].chan = 2;
2825
2826		alu.last = 1;
2827		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2828			return r;
2829
2830		/* 4. tmp0.y = hi (tmp0.x * src2) */
2831		if (ctx->bc->chip_class == CAYMAN) {
2832			for (j = 0 ; j < 4; j++) {
2833				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2834				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2835
2836				alu.dst.sel = tmp0;
2837				alu.dst.chan = j;
2838				alu.dst.write = (j == 1);
2839
2840				alu.src[0].sel = tmp0;
2841				alu.src[0].chan = 0;
2842
2843				if (signed_op) {
2844					alu.src[1].sel = tmp2;
2845					alu.src[1].chan = 1;
2846				} else {
2847					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2848				}
2849				alu.last = (j == 3);
2850				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2851					return r;
2852			}
2853		} else {
2854			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2855			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2856
2857			alu.dst.sel = tmp0;
2858			alu.dst.chan = 1;
2859			alu.dst.write = 1;
2860
2861			alu.src[0].sel = tmp0;
2862			alu.src[0].chan = 0;
2863
2864			if (signed_op) {
2865				alu.src[1].sel = tmp2;
2866				alu.src[1].chan = 1;
2867			} else {
2868				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2869			}
2870
2871			alu.last = 1;
2872			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2873				return r;
2874		}
2875
2876		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
2877		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2878		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
2879		alu.is_op3 = 1;
2880
2881		alu.dst.sel = tmp0;
2882		alu.dst.chan = 2;
2883		alu.dst.write = 1;
2884
2885		alu.src[0].sel = tmp0;
2886		alu.src[0].chan = 1;
2887		alu.src[1].sel = tmp0;
2888		alu.src[1].chan = 3;
2889		alu.src[2].sel = tmp0;
2890		alu.src[2].chan = 2;
2891
2892		alu.last = 1;
2893		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2894			return r;
2895
2896		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
2897		if (ctx->bc->chip_class == CAYMAN) {
2898			for (j = 0 ; j < 4; j++) {
2899				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2900				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2901
2902				alu.dst.sel = tmp0;
2903				alu.dst.chan = j;
2904				alu.dst.write = (j == 3);
2905
2906				alu.src[0].sel = tmp0;
2907				alu.src[0].chan = 2;
2908
2909				alu.src[1].sel = tmp0;
2910				alu.src[1].chan = 0;
2911
2912				alu.last = (j == 3);
2913				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2914					return r;
2915			}
2916		} else {
2917			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2918			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2919
2920			alu.dst.sel = tmp0;
2921			alu.dst.chan = 3;
2922			alu.dst.write = 1;
2923
2924			alu.src[0].sel = tmp0;
2925			alu.src[0].chan = 2;
2926
2927			alu.src[1].sel = tmp0;
2928			alu.src[1].chan = 0;
2929
2930			alu.last = 1;
2931			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2932				return r;
2933		}
2934
2935		/* 7. tmp1.x = tmp0.x - tmp0.w */
2936		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2937		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2938
2939		alu.dst.sel = tmp1;
2940		alu.dst.chan = 0;
2941		alu.dst.write = 1;
2942
2943		alu.src[0].sel = tmp0;
2944		alu.src[0].chan = 0;
2945		alu.src[1].sel = tmp0;
2946		alu.src[1].chan = 3;
2947
2948		alu.last = 1;
2949		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2950			return r;
2951
2952		/* 8. tmp1.y = tmp0.x + tmp0.w */
2953		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2954		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
2955
2956		alu.dst.sel = tmp1;
2957		alu.dst.chan = 1;
2958		alu.dst.write = 1;
2959
2960		alu.src[0].sel = tmp0;
2961		alu.src[0].chan = 0;
2962		alu.src[1].sel = tmp0;
2963		alu.src[1].chan = 3;
2964
2965		alu.last = 1;
2966		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2967			return r;
2968
2969		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
2970		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2971		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
2972		alu.is_op3 = 1;
2973
2974		alu.dst.sel = tmp0;
2975		alu.dst.chan = 0;
2976		alu.dst.write = 1;
2977
2978		alu.src[0].sel = tmp0;
2979		alu.src[0].chan = 1;
2980		alu.src[1].sel = tmp1;
2981		alu.src[1].chan = 1;
2982		alu.src[2].sel = tmp1;
2983		alu.src[2].chan = 0;
2984
2985		alu.last = 1;
2986		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2987			return r;
2988
2989		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
2990		if (ctx->bc->chip_class == CAYMAN) {
2991			for (j = 0 ; j < 4; j++) {
2992				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2993				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2994
2995				alu.dst.sel = tmp0;
2996				alu.dst.chan = j;
2997				alu.dst.write = (j == 2);
2998
2999				alu.src[0].sel = tmp0;
3000				alu.src[0].chan = 0;
3001
3002				if (signed_op) {
3003					alu.src[1].sel = tmp2;
3004					alu.src[1].chan = 0;
3005				} else {
3006					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3007				}
3008
3009				alu.last = (j == 3);
3010				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3011					return r;
3012			}
3013		} else {
3014			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3015			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3016
3017			alu.dst.sel = tmp0;
3018			alu.dst.chan = 2;
3019			alu.dst.write = 1;
3020
3021			alu.src[0].sel = tmp0;
3022			alu.src[0].chan = 0;
3023
3024			if (signed_op) {
3025				alu.src[1].sel = tmp2;
3026				alu.src[1].chan = 0;
3027			} else {
3028				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3029			}
3030
3031			alu.last = 1;
3032			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3033				return r;
3034		}
3035
3036		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
3037		if (ctx->bc->chip_class == CAYMAN) {
3038			for (j = 0 ; j < 4; j++) {
3039				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3040				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3041
3042				alu.dst.sel = tmp0;
3043				alu.dst.chan = j;
3044				alu.dst.write = (j == 1);
3045
3046				if (signed_op) {
3047					alu.src[0].sel = tmp2;
3048					alu.src[0].chan = 1;
3049				} else {
3050					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3051				}
3052
3053				alu.src[1].sel = tmp0;
3054				alu.src[1].chan = 2;
3055
3056				alu.last = (j == 3);
3057				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3058					return r;
3059			}
3060		} else {
3061			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3062			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3063
3064			alu.dst.sel = tmp0;
3065			alu.dst.chan = 1;
3066			alu.dst.write = 1;
3067
3068			if (signed_op) {
3069				alu.src[0].sel = tmp2;
3070				alu.src[0].chan = 1;
3071			} else {
3072				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3073			}
3074
3075			alu.src[1].sel = tmp0;
3076			alu.src[1].chan = 2;
3077
3078			alu.last = 1;
3079			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3080				return r;
3081		}
3082
3083		/* 12. tmp0.w = src1 - tmp0.y       = r */
3084		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3085		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3086
3087		alu.dst.sel = tmp0;
3088		alu.dst.chan = 3;
3089		alu.dst.write = 1;
3090
3091		if (signed_op) {
3092			alu.src[0].sel = tmp2;
3093			alu.src[0].chan = 0;
3094		} else {
3095			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3096		}
3097
3098		alu.src[1].sel = tmp0;
3099		alu.src[1].chan = 1;
3100
3101		alu.last = 1;
3102		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3103			return r;
3104
3105		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
3106		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3107		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3108
3109		alu.dst.sel = tmp1;
3110		alu.dst.chan = 0;
3111		alu.dst.write = 1;
3112
3113		alu.src[0].sel = tmp0;
3114		alu.src[0].chan = 3;
3115		if (signed_op) {
3116			alu.src[1].sel = tmp2;
3117			alu.src[1].chan = 1;
3118		} else {
3119			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3120		}
3121
3122		alu.last = 1;
3123		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3124			return r;
3125
3126		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
3127		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3128		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3129
3130		alu.dst.sel = tmp1;
3131		alu.dst.chan = 1;
3132		alu.dst.write = 1;
3133
3134		if (signed_op) {
3135			alu.src[0].sel = tmp2;
3136			alu.src[0].chan = 0;
3137		} else {
3138			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3139		}
3140
3141		alu.src[1].sel = tmp0;
3142		alu.src[1].chan = 1;
3143
3144		alu.last = 1;
3145		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3146			return r;
3147
3148		if (mod) { /* UMOD */
3149
3150			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
3151			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3152			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3153
3154			alu.dst.sel = tmp1;
3155			alu.dst.chan = 2;
3156			alu.dst.write = 1;
3157
3158			alu.src[0].sel = tmp0;
3159			alu.src[0].chan = 3;
3160
3161			if (signed_op) {
3162				alu.src[1].sel = tmp2;
3163				alu.src[1].chan = 1;
3164			} else {
3165				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3166			}
3167
3168			alu.last = 1;
3169			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3170				return r;
3171
3172			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
3173			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3174			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3175
3176			alu.dst.sel = tmp1;
3177			alu.dst.chan = 3;
3178			alu.dst.write = 1;
3179
3180			alu.src[0].sel = tmp0;
3181			alu.src[0].chan = 3;
3182			if (signed_op) {
3183				alu.src[1].sel = tmp2;
3184				alu.src[1].chan = 1;
3185			} else {
3186				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3187			}
3188
3189			alu.last = 1;
3190			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3191				return r;
3192
3193		} else { /* UDIV */
3194
3195			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
3196			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3197			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3198
3199			alu.dst.sel = tmp1;
3200			alu.dst.chan = 2;
3201			alu.dst.write = 1;
3202
3203			alu.src[0].sel = tmp0;
3204			alu.src[0].chan = 2;
3205			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3206
3207			alu.last = 1;
3208			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3209				return r;
3210
3211			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
3212			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3213			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3214
3215			alu.dst.sel = tmp1;
3216			alu.dst.chan = 3;
3217			alu.dst.write = 1;
3218
3219			alu.src[0].sel = tmp0;
3220			alu.src[0].chan = 2;
3221			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
3222
3223			alu.last = 1;
3224			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3225				return r;
3226
3227		}
3228
3229		/* 17. tmp1.x = tmp1.x & tmp1.y */
3230		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3231		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
3232
3233		alu.dst.sel = tmp1;
3234		alu.dst.chan = 0;
3235		alu.dst.write = 1;
3236
3237		alu.src[0].sel = tmp1;
3238		alu.src[0].chan = 0;
3239		alu.src[1].sel = tmp1;
3240		alu.src[1].chan = 1;
3241
3242		alu.last = 1;
3243		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3244			return r;
3245
3246		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
3247		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
3248		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3249		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3250		alu.is_op3 = 1;
3251
3252		alu.dst.sel = tmp0;
3253		alu.dst.chan = 2;
3254		alu.dst.write = 1;
3255
3256		alu.src[0].sel = tmp1;
3257		alu.src[0].chan = 0;
3258		alu.src[1].sel = tmp0;
3259		alu.src[1].chan = mod ? 3 : 2;
3260		alu.src[2].sel = tmp1;
3261		alu.src[2].chan = 2;
3262
3263		alu.last = 1;
3264		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3265			return r;
3266
3267		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
3268		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3269		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3270		alu.is_op3 = 1;
3271
3272		if (signed_op) {
3273			alu.dst.sel = tmp0;
3274			alu.dst.chan = 2;
3275			alu.dst.write = 1;
3276		} else {
3277			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3278		}
3279
3280		alu.src[0].sel = tmp1;
3281		alu.src[0].chan = 1;
3282		alu.src[1].sel = tmp1;
3283		alu.src[1].chan = 3;
3284		alu.src[2].sel = tmp0;
3285		alu.src[2].chan = 2;
3286
3287		alu.last = 1;
3288		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3289			return r;
3290
3291		if (signed_op) {
3292
3293			/* fix the sign of the result */
3294
3295			if (mod) {
3296
3297				/* tmp0.x = -tmp0.z */
3298				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3299				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3300
3301				alu.dst.sel = tmp0;
3302				alu.dst.chan = 0;
3303				alu.dst.write = 1;
3304
3305				alu.src[0].sel = V_SQ_ALU_SRC_0;
3306				alu.src[1].sel = tmp0;
3307				alu.src[1].chan = 2;
3308
3309				alu.last = 1;
3310				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3311					return r;
3312
3313				/* sign of the remainder is the same as the sign of src0 */
3314				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
3315				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3316				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3317				alu.is_op3 = 1;
3318
3319				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3320
3321				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3322				alu.src[1].sel = tmp0;
3323				alu.src[1].chan = 2;
3324				alu.src[2].sel = tmp0;
3325				alu.src[2].chan = 0;
3326
3327				alu.last = 1;
3328				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3329					return r;
3330
3331			} else {
3332
3333				/* tmp0.x = -tmp0.z */
3334				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3335				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3336
3337				alu.dst.sel = tmp0;
3338				alu.dst.chan = 0;
3339				alu.dst.write = 1;
3340
3341				alu.src[0].sel = V_SQ_ALU_SRC_0;
3342				alu.src[1].sel = tmp0;
3343				alu.src[1].chan = 2;
3344
3345				alu.last = 1;
3346				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3347					return r;
3348
3349				/* fix the quotient sign (same as the sign of src0*src1) */
3350				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
3351				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3352				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3353				alu.is_op3 = 1;
3354
3355				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3356
3357				alu.src[0].sel = tmp2;
3358				alu.src[0].chan = 2;
3359				alu.src[1].sel = tmp0;
3360				alu.src[1].chan = 2;
3361				alu.src[2].sel = tmp0;
3362				alu.src[2].chan = 0;
3363
3364				alu.last = 1;
3365				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3366					return r;
3367			}
3368		}
3369	}
3370	return 0;
3371}
3372
/* TGSI UDIV: unsigned quotient, produced by the shared tgsi_divmod() lowering. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3377
/* TGSI UMOD: unsigned remainder, produced by the shared tgsi_divmod() lowering. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3382
/* TGSI IDIV: signed quotient, produced by the shared tgsi_divmod() lowering. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3387
/* TGSI IMOD: signed remainder, produced by the shared tgsi_divmod() lowering. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3392
3393
3394static int tgsi_f2i(struct r600_shader_ctx *ctx)
3395{
3396	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3397	struct r600_bytecode_alu alu;
3398	int i, r;
3399	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3400	int last_inst = tgsi_last_instruction(write_mask);
3401
3402	for (i = 0; i < 4; i++) {
3403		if (!(write_mask & (1<<i)))
3404			continue;
3405
3406		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3407		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC);
3408
3409		alu.dst.sel = ctx->temp_reg;
3410		alu.dst.chan = i;
3411		alu.dst.write = 1;
3412
3413		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3414		if (i == last_inst)
3415			alu.last = 1;
3416		r = r600_bytecode_add_alu(ctx->bc, &alu);
3417		if (r)
3418			return r;
3419	}
3420
3421	for (i = 0; i < 4; i++) {
3422		if (!(write_mask & (1<<i)))
3423			continue;
3424
3425		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3426		alu.inst = ctx->inst_info->r600_opcode;
3427
3428		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3429
3430		alu.src[0].sel = ctx->temp_reg;
3431		alu.src[0].chan = i;
3432
3433		if (i == last_inst || alu.inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT)
3434			alu.last = 1;
3435		r = r600_bytecode_add_alu(ctx->bc, &alu);
3436		if (r)
3437			return r;
3438	}
3439
3440	return 0;
3441}
3442
3443static int tgsi_iabs(struct r600_shader_ctx *ctx)
3444{
3445	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3446	struct r600_bytecode_alu alu;
3447	int i, r;
3448	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3449	int last_inst = tgsi_last_instruction(write_mask);
3450
3451	/* tmp = -src */
3452	for (i = 0; i < 4; i++) {
3453		if (!(write_mask & (1<<i)))
3454			continue;
3455
3456		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3457		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3458
3459		alu.dst.sel = ctx->temp_reg;
3460		alu.dst.chan = i;
3461		alu.dst.write = 1;
3462
3463		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3464		alu.src[0].sel = V_SQ_ALU_SRC_0;
3465
3466		if (i == last_inst)
3467			alu.last = 1;
3468		r = r600_bytecode_add_alu(ctx->bc, &alu);
3469		if (r)
3470			return r;
3471	}
3472
3473	/* dst = (src >= 0 ? src : tmp) */
3474	for (i = 0; i < 4; i++) {
3475		if (!(write_mask & (1<<i)))
3476			continue;
3477
3478		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3479		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3480		alu.is_op3 = 1;
3481		alu.dst.write = 1;
3482
3483		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3484
3485		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3486		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3487		alu.src[2].sel = ctx->temp_reg;
3488		alu.src[2].chan = i;
3489
3490		if (i == last_inst)
3491			alu.last = 1;
3492		r = r600_bytecode_add_alu(ctx->bc, &alu);
3493		if (r)
3494			return r;
3495	}
3496	return 0;
3497}
3498
3499static int tgsi_issg(struct r600_shader_ctx *ctx)
3500{
3501	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3502	struct r600_bytecode_alu alu;
3503	int i, r;
3504	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3505	int last_inst = tgsi_last_instruction(write_mask);
3506
3507	/* tmp = (src >= 0 ? src : -1) */
3508	for (i = 0; i < 4; i++) {
3509		if (!(write_mask & (1<<i)))
3510			continue;
3511
3512		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3513		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3514		alu.is_op3 = 1;
3515
3516		alu.dst.sel = ctx->temp_reg;
3517		alu.dst.chan = i;
3518		alu.dst.write = 1;
3519
3520		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3521		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3522		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
3523
3524		if (i == last_inst)
3525			alu.last = 1;
3526		r = r600_bytecode_add_alu(ctx->bc, &alu);
3527		if (r)
3528			return r;
3529	}
3530
3531	/* dst = (tmp > 0 ? 1 : tmp) */
3532	for (i = 0; i < 4; i++) {
3533		if (!(write_mask & (1<<i)))
3534			continue;
3535
3536		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3537		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT_INT);
3538		alu.is_op3 = 1;
3539		alu.dst.write = 1;
3540
3541		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3542
3543		alu.src[0].sel = ctx->temp_reg;
3544		alu.src[0].chan = i;
3545
3546		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3547
3548		alu.src[2].sel = ctx->temp_reg;
3549		alu.src[2].chan = i;
3550
3551		if (i == last_inst)
3552			alu.last = 1;
3553		r = r600_bytecode_add_alu(ctx->bc, &alu);
3554		if (r)
3555			return r;
3556	}
3557	return 0;
3558}
3559
3560
3561
3562static int tgsi_ssg(struct r600_shader_ctx *ctx)
3563{
3564	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3565	struct r600_bytecode_alu alu;
3566	int i, r;
3567
3568	/* tmp = (src > 0 ? 1 : src) */
3569	for (i = 0; i < 4; i++) {
3570		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3571		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3572		alu.is_op3 = 1;
3573
3574		alu.dst.sel = ctx->temp_reg;
3575		alu.dst.chan = i;
3576
3577		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3578		alu.src[1].sel = V_SQ_ALU_SRC_1;
3579		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
3580
3581		if (i == 3)
3582			alu.last = 1;
3583		r = r600_bytecode_add_alu(ctx->bc, &alu);
3584		if (r)
3585			return r;
3586	}
3587
3588	/* dst = (-tmp > 0 ? -1 : tmp) */
3589	for (i = 0; i < 4; i++) {
3590		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3591		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3592		alu.is_op3 = 1;
3593		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3594
3595		alu.src[0].sel = ctx->temp_reg;
3596		alu.src[0].chan = i;
3597		alu.src[0].neg = 1;
3598
3599		alu.src[1].sel = V_SQ_ALU_SRC_1;
3600		alu.src[1].neg = 1;
3601
3602		alu.src[2].sel = ctx->temp_reg;
3603		alu.src[2].chan = i;
3604
3605		if (i == 3)
3606			alu.last = 1;
3607		r = r600_bytecode_add_alu(ctx->bc, &alu);
3608		if (r)
3609			return r;
3610	}
3611	return 0;
3612}
3613
3614static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
3615{
3616	struct r600_bytecode_alu alu;
3617	int i, r;
3618
3619	for (i = 0; i < 4; i++) {
3620		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3621		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
3622			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
3623			alu.dst.chan = i;
3624		} else {
3625			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3626			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3627			alu.src[0].sel = ctx->temp_reg;
3628			alu.src[0].chan = i;
3629		}
3630		if (i == 3) {
3631			alu.last = 1;
3632		}
3633		r = r600_bytecode_add_alu(ctx->bc, &alu);
3634		if (r)
3635			return r;
3636	}
3637	return 0;
3638}
3639
3640static int tgsi_op3(struct r600_shader_ctx *ctx)
3641{
3642	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3643	struct r600_bytecode_alu alu;
3644	int i, j, r;
3645	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3646
3647	for (i = 0; i < lasti + 1; i++) {
3648		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3649			continue;
3650
3651		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3652		alu.inst = ctx->inst_info->r600_opcode;
3653		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3654			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3655		}
3656
3657		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3658		alu.dst.chan = i;
3659		alu.dst.write = 1;
3660		alu.is_op3 = 1;
3661		if (i == lasti) {
3662			alu.last = 1;
3663		}
3664		r = r600_bytecode_add_alu(ctx->bc, &alu);
3665		if (r)
3666			return r;
3667	}
3668	return 0;
3669}
3670
3671static int tgsi_dp(struct r600_shader_ctx *ctx)
3672{
3673	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3674	struct r600_bytecode_alu alu;
3675	int i, j, r;
3676
3677	for (i = 0; i < 4; i++) {
3678		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3679		alu.inst = ctx->inst_info->r600_opcode;
3680		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3681			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3682		}
3683
3684		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3685		alu.dst.chan = i;
3686		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3687		/* handle some special cases */
3688		switch (ctx->inst_info->tgsi_opcode) {
3689		case TGSI_OPCODE_DP2:
3690			if (i > 1) {
3691				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3692				alu.src[0].chan = alu.src[1].chan = 0;
3693			}
3694			break;
3695		case TGSI_OPCODE_DP3:
3696			if (i > 2) {
3697				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3698				alu.src[0].chan = alu.src[1].chan = 0;
3699			}
3700			break;
3701		case TGSI_OPCODE_DPH:
3702			if (i == 3) {
3703				alu.src[0].sel = V_SQ_ALU_SRC_1;
3704				alu.src[0].chan = 0;
3705				alu.src[0].neg = 0;
3706			}
3707			break;
3708		default:
3709			break;
3710		}
3711		if (i == 3) {
3712			alu.last = 1;
3713		}
3714		r = r600_bytecode_add_alu(ctx->bc, &alu);
3715		if (r)
3716			return r;
3717	}
3718	return 0;
3719}
3720
3721static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
3722						    unsigned index)
3723{
3724	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3725	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
3726		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
3727		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
3728		ctx->src[index].neg || ctx->src[index].abs;
3729}
3730
3731static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
3732					unsigned index)
3733{
3734	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3735	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
3736}
3737
3738static int tgsi_tex(struct r600_shader_ctx *ctx)
3739{
3740	static float one_point_five = 1.5f;
3741	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3742	struct r600_bytecode_tex tex;
3743	struct r600_bytecode_alu alu;
3744	unsigned src_gpr;
3745	int r, i, j;
3746	int opcode;
3747	/* Texture fetch instructions can only use gprs as source.
3748	 * Also they cannot negate the source or take the absolute value */
3749	const boolean src_requires_loading = inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
3750                                             tgsi_tex_src_requires_loading(ctx, 0);
3751	boolean src_loaded = FALSE;
3752	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
3753	uint8_t offset_x = 0, offset_y = 0, offset_z = 0;
3754
3755	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
3756
3757	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
3758		/* get offset values */
3759		if (inst->Texture.NumOffsets) {
3760			assert(inst->Texture.NumOffsets == 1);
3761
3762			offset_x = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
3763			offset_y = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
3764			offset_z = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
3765		}
3766	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
3767		/* TGSI moves the sampler to src reg 3 for TXD */
3768		sampler_src_reg = 3;
3769
3770		for (i = 1; i < 3; i++) {
3771			/* set gradients h/v */
3772			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
3773			tex.inst = (i == 1) ? SQ_TEX_INST_SET_GRADIENTS_H :
3774				SQ_TEX_INST_SET_GRADIENTS_V;
3775			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
3776			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
3777
3778			if (tgsi_tex_src_requires_loading(ctx, i)) {
3779				tex.src_gpr = r600_get_temp(ctx);
3780				tex.src_sel_x = 0;
3781				tex.src_sel_y = 1;
3782				tex.src_sel_z = 2;
3783				tex.src_sel_w = 3;
3784
3785				for (j = 0; j < 4; j++) {
3786					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3787					alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3788                                        r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
3789                                        alu.dst.sel = tex.src_gpr;
3790                                        alu.dst.chan = j;
3791                                        if (j == 3)
3792                                                alu.last = 1;
3793                                        alu.dst.write = 1;
3794                                        r = r600_bytecode_add_alu(ctx->bc, &alu);
3795                                        if (r)
3796                                                return r;
3797				}
3798
3799			} else {
3800				tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i);
3801				tex.src_sel_x = ctx->src[i].swizzle[0];
3802				tex.src_sel_y = ctx->src[i].swizzle[1];
3803				tex.src_sel_z = ctx->src[i].swizzle[2];
3804				tex.src_sel_w = ctx->src[i].swizzle[3];
3805				tex.src_rel = ctx->src[i].rel;
3806			}
3807			tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */
3808			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
3809			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
3810				tex.coord_type_x = 1;
3811				tex.coord_type_y = 1;
3812				tex.coord_type_z = 1;
3813				tex.coord_type_w = 1;
3814			}
3815			r = r600_bytecode_add_tex(ctx->bc, &tex);
3816			if (r)
3817				return r;
3818		}
3819	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
3820		int out_chan;
3821		/* Add perspective divide */
3822		if (ctx->bc->chip_class == CAYMAN) {
3823			out_chan = 2;
3824			for (i = 0; i < 3; i++) {
3825				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3826				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3827				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3828
3829				alu.dst.sel = ctx->temp_reg;
3830				alu.dst.chan = i;
3831				if (i == 2)
3832					alu.last = 1;
3833				if (out_chan == i)
3834					alu.dst.write = 1;
3835				r = r600_bytecode_add_alu(ctx->bc, &alu);
3836				if (r)
3837					return r;
3838			}
3839
3840		} else {
3841			out_chan = 3;
3842			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3843			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3844			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3845
3846			alu.dst.sel = ctx->temp_reg;
3847			alu.dst.chan = out_chan;
3848			alu.last = 1;
3849			alu.dst.write = 1;
3850			r = r600_bytecode_add_alu(ctx->bc, &alu);
3851			if (r)
3852				return r;
3853		}
3854
3855		for (i = 0; i < 3; i++) {
3856			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3857			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
3858			alu.src[0].sel = ctx->temp_reg;
3859			alu.src[0].chan = out_chan;
3860			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3861			alu.dst.sel = ctx->temp_reg;
3862			alu.dst.chan = i;
3863			alu.dst.write = 1;
3864			r = r600_bytecode_add_alu(ctx->bc, &alu);
3865			if (r)
3866				return r;
3867		}
3868		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3869		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3870		alu.src[0].sel = V_SQ_ALU_SRC_1;
3871		alu.src[0].chan = 0;
3872		alu.dst.sel = ctx->temp_reg;
3873		alu.dst.chan = 3;
3874		alu.last = 1;
3875		alu.dst.write = 1;
3876		r = r600_bytecode_add_alu(ctx->bc, &alu);
3877		if (r)
3878			return r;
3879		src_loaded = TRUE;
3880		src_gpr = ctx->temp_reg;
3881	}
3882
3883	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
3884	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) &&
3885	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
3886	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
3887
3888		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
3889		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
3890
3891		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
3892		for (i = 0; i < 4; i++) {
3893			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3894			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE);
3895			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
3896			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
3897			alu.dst.sel = ctx->temp_reg;
3898			alu.dst.chan = i;
3899			if (i == 3)
3900				alu.last = 1;
3901			alu.dst.write = 1;
3902			r = r600_bytecode_add_alu(ctx->bc, &alu);
3903			if (r)
3904				return r;
3905		}
3906
3907		/* tmp1.z = RCP_e(|tmp1.z|) */
3908		if (ctx->bc->chip_class == CAYMAN) {
3909			for (i = 0; i < 3; i++) {
3910				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3911				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3912				alu.src[0].sel = ctx->temp_reg;
3913				alu.src[0].chan = 2;
3914				alu.src[0].abs = 1;
3915				alu.dst.sel = ctx->temp_reg;
3916				alu.dst.chan = i;
3917				if (i == 2)
3918					alu.dst.write = 1;
3919				if (i == 2)
3920					alu.last = 1;
3921				r = r600_bytecode_add_alu(ctx->bc, &alu);
3922				if (r)
3923					return r;
3924			}
3925		} else {
3926			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3927			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3928			alu.src[0].sel = ctx->temp_reg;
3929			alu.src[0].chan = 2;
3930			alu.src[0].abs = 1;
3931			alu.dst.sel = ctx->temp_reg;
3932			alu.dst.chan = 2;
3933			alu.dst.write = 1;
3934			alu.last = 1;
3935			r = r600_bytecode_add_alu(ctx->bc, &alu);
3936			if (r)
3937				return r;
3938		}
3939
3940		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
3941		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
3942		 * muladd has no writemask, have to use another temp
3943		 */
3944		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3945		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
3946		alu.is_op3 = 1;
3947
3948		alu.src[0].sel = ctx->temp_reg;
3949		alu.src[0].chan = 0;
3950		alu.src[1].sel = ctx->temp_reg;
3951		alu.src[1].chan = 2;
3952
3953		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
3954		alu.src[2].chan = 0;
3955		alu.src[2].value = *(uint32_t *)&one_point_five;
3956
3957		alu.dst.sel = ctx->temp_reg;
3958		alu.dst.chan = 0;
3959		alu.dst.write = 1;
3960
3961		r = r600_bytecode_add_alu(ctx->bc, &alu);
3962		if (r)
3963			return r;
3964
3965		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3966		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
3967		alu.is_op3 = 1;
3968
3969		alu.src[0].sel = ctx->temp_reg;
3970		alu.src[0].chan = 1;
3971		alu.src[1].sel = ctx->temp_reg;
3972		alu.src[1].chan = 2;
3973
3974		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
3975		alu.src[2].chan = 0;
3976		alu.src[2].value = *(uint32_t *)&one_point_five;
3977
3978		alu.dst.sel = ctx->temp_reg;
3979		alu.dst.chan = 1;
3980		alu.dst.write = 1;
3981
3982		alu.last = 1;
3983		r = r600_bytecode_add_alu(ctx->bc, &alu);
3984		if (r)
3985			return r;
3986		/* write initial W value into Z component */
3987		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
3988			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3989			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3990			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3991			alu.dst.sel = ctx->temp_reg;
3992			alu.dst.chan = 2;
3993			alu.dst.write = 1;
3994			alu.last = 1;
3995			r = r600_bytecode_add_alu(ctx->bc, &alu);
3996			if (r)
3997				return r;
3998		}
3999		src_loaded = TRUE;
4000		src_gpr = ctx->temp_reg;
4001	}
4002
4003	if (src_requires_loading && !src_loaded) {
4004		for (i = 0; i < 4; i++) {
4005			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4006			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4007			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4008			alu.dst.sel = ctx->temp_reg;
4009			alu.dst.chan = i;
4010			if (i == 3)
4011				alu.last = 1;
4012			alu.dst.write = 1;
4013			r = r600_bytecode_add_alu(ctx->bc, &alu);
4014			if (r)
4015				return r;
4016		}
4017		src_loaded = TRUE;
4018		src_gpr = ctx->temp_reg;
4019	}
4020
4021	opcode = ctx->inst_info->r600_opcode;
4022	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4023	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4024	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4025	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4026	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
4027	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
4028		switch (opcode) {
4029		case SQ_TEX_INST_SAMPLE:
4030			opcode = SQ_TEX_INST_SAMPLE_C;
4031			break;
4032		case SQ_TEX_INST_SAMPLE_L:
4033			opcode = SQ_TEX_INST_SAMPLE_C_L;
4034			break;
4035		case SQ_TEX_INST_SAMPLE_LB:
4036			opcode = SQ_TEX_INST_SAMPLE_C_LB;
4037			break;
4038		case SQ_TEX_INST_SAMPLE_G:
4039			opcode = SQ_TEX_INST_SAMPLE_C_G;
4040			break;
4041		}
4042	}
4043
4044	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4045	tex.inst = opcode;
4046
4047	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4048	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4049	tex.src_gpr = src_gpr;
4050	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4051	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
4052	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
4053	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
4054	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
4055
4056	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
4057		tex.src_sel_x = 4;
4058		tex.src_sel_y = 4;
4059		tex.src_sel_z = 4;
4060		tex.src_sel_w = 4;
4061	} else if (src_loaded) {
4062		tex.src_sel_x = 0;
4063		tex.src_sel_y = 1;
4064		tex.src_sel_z = 2;
4065		tex.src_sel_w = 3;
4066	} else {
4067		tex.src_sel_x = ctx->src[0].swizzle[0];
4068		tex.src_sel_y = ctx->src[0].swizzle[1];
4069		tex.src_sel_z = ctx->src[0].swizzle[2];
4070		tex.src_sel_w = ctx->src[0].swizzle[3];
4071		tex.src_rel = ctx->src[0].rel;
4072	}
4073
4074	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE) {
4075		tex.src_sel_x = 1;
4076		tex.src_sel_y = 0;
4077		tex.src_sel_z = 3;
4078		tex.src_sel_w = 1;
4079	}
4080	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
4081		tex.src_sel_x = 1;
4082		tex.src_sel_y = 0;
4083		tex.src_sel_z = 3;
4084		tex.src_sel_w = 2; /* route Z compare value into W */
4085	}
4086
4087	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
4088	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
4089		tex.coord_type_x = 1;
4090		tex.coord_type_y = 1;
4091	}
4092	tex.coord_type_z = 1;
4093	tex.coord_type_w = 1;
4094
4095	tex.offset_x = offset_x;
4096	tex.offset_y = offset_y;
4097	tex.offset_z = offset_z;
4098
4099	/* Put the depth for comparison in W.
4100	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
4101	 * Some instructions expect the depth in Z. */
4102	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4103	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4104	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4105	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
4106	    opcode != SQ_TEX_INST_SAMPLE_C_L &&
4107	    opcode != SQ_TEX_INST_SAMPLE_C_LB) {
4108		tex.src_sel_w = tex.src_sel_z;
4109	}
4110
4111	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
4112	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4113		if (opcode == SQ_TEX_INST_SAMPLE_C_L ||
4114		    opcode == SQ_TEX_INST_SAMPLE_C_LB) {
4115			/* the array index is read from Y */
4116			tex.coord_type_y = 0;
4117		} else {
4118			/* the array index is read from Z */
4119			tex.coord_type_z = 0;
4120			tex.src_sel_z = tex.src_sel_y;
4121		}
4122	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
4123		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
4124		/* the array index is read from Z */
4125		tex.coord_type_z = 0;
4126
4127	r = r600_bytecode_add_tex(ctx->bc, &tex);
4128	if (r)
4129		return r;
4130
4131	/* add shadow ambient support  - gallium doesn't do it yet */
4132	return 0;
4133}
4134
4135static int tgsi_lrp(struct r600_shader_ctx *ctx)
4136{
4137	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4138	struct r600_bytecode_alu alu;
4139	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4140	unsigned i;
4141	int r;
4142
4143	/* optimize if it's just an equal balance */
4144	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
4145		for (i = 0; i < lasti + 1; i++) {
4146			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4147				continue;
4148
4149			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4150			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4151			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4152			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4153			alu.omod = 3;
4154			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4155			alu.dst.chan = i;
4156			if (i == lasti) {
4157				alu.last = 1;
4158			}
4159			r = r600_bytecode_add_alu(ctx->bc, &alu);
4160			if (r)
4161				return r;
4162		}
4163		return 0;
4164	}
4165
4166	/* 1 - src0 */
4167	for (i = 0; i < lasti + 1; i++) {
4168		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4169			continue;
4170
4171		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4172		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4173		alu.src[0].sel = V_SQ_ALU_SRC_1;
4174		alu.src[0].chan = 0;
4175		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4176		r600_bytecode_src_toggle_neg(&alu.src[1]);
4177		alu.dst.sel = ctx->temp_reg;
4178		alu.dst.chan = i;
4179		if (i == lasti) {
4180			alu.last = 1;
4181		}
4182		alu.dst.write = 1;
4183		r = r600_bytecode_add_alu(ctx->bc, &alu);
4184		if (r)
4185			return r;
4186	}
4187
4188	/* (1 - src0) * src2 */
4189	for (i = 0; i < lasti + 1; i++) {
4190		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4191			continue;
4192
4193		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4194		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4195		alu.src[0].sel = ctx->temp_reg;
4196		alu.src[0].chan = i;
4197		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4198		alu.dst.sel = ctx->temp_reg;
4199		alu.dst.chan = i;
4200		if (i == lasti) {
4201			alu.last = 1;
4202		}
4203		alu.dst.write = 1;
4204		r = r600_bytecode_add_alu(ctx->bc, &alu);
4205		if (r)
4206			return r;
4207	}
4208
4209	/* src0 * src1 + (1 - src0) * src2 */
4210	for (i = 0; i < lasti + 1; i++) {
4211		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4212			continue;
4213
4214		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4215		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4216		alu.is_op3 = 1;
4217		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4218		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4219		alu.src[2].sel = ctx->temp_reg;
4220		alu.src[2].chan = i;
4221
4222		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4223		alu.dst.chan = i;
4224		if (i == lasti) {
4225			alu.last = 1;
4226		}
4227		r = r600_bytecode_add_alu(ctx->bc, &alu);
4228		if (r)
4229			return r;
4230	}
4231	return 0;
4232}
4233
4234static int tgsi_cmp(struct r600_shader_ctx *ctx)
4235{
4236	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4237	struct r600_bytecode_alu alu;
4238	int i, r;
4239	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4240
4241	for (i = 0; i < lasti + 1; i++) {
4242		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4243			continue;
4244
4245		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4246		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE);
4247		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4248		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4249		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
4250		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4251		alu.dst.chan = i;
4252		alu.dst.write = 1;
4253		alu.is_op3 = 1;
4254		if (i == lasti)
4255			alu.last = 1;
4256		r = r600_bytecode_add_alu(ctx->bc, &alu);
4257		if (r)
4258			return r;
4259	}
4260	return 0;
4261}
4262
4263static int tgsi_xpd(struct r600_shader_ctx *ctx)
4264{
4265	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4266	static const unsigned int src0_swizzle[] = {2, 0, 1};
4267	static const unsigned int src1_swizzle[] = {1, 2, 0};
4268	struct r600_bytecode_alu alu;
4269	uint32_t use_temp = 0;
4270	int i, r;
4271
4272	if (inst->Dst[0].Register.WriteMask != 0xf)
4273		use_temp = 1;
4274
4275	for (i = 0; i < 4; i++) {
4276		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4277		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4278		if (i < 3) {
4279			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
4280			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
4281		} else {
4282			alu.src[0].sel = V_SQ_ALU_SRC_0;
4283			alu.src[0].chan = i;
4284			alu.src[1].sel = V_SQ_ALU_SRC_0;
4285			alu.src[1].chan = i;
4286		}
4287
4288		alu.dst.sel = ctx->temp_reg;
4289		alu.dst.chan = i;
4290		alu.dst.write = 1;
4291
4292		if (i == 3)
4293			alu.last = 1;
4294		r = r600_bytecode_add_alu(ctx->bc, &alu);
4295		if (r)
4296			return r;
4297	}
4298
4299	for (i = 0; i < 4; i++) {
4300		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4301		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4302
4303		if (i < 3) {
4304			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
4305			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
4306		} else {
4307			alu.src[0].sel = V_SQ_ALU_SRC_0;
4308			alu.src[0].chan = i;
4309			alu.src[1].sel = V_SQ_ALU_SRC_0;
4310			alu.src[1].chan = i;
4311		}
4312
4313		alu.src[2].sel = ctx->temp_reg;
4314		alu.src[2].neg = 1;
4315		alu.src[2].chan = i;
4316
4317		if (use_temp)
4318			alu.dst.sel = ctx->temp_reg;
4319		else
4320			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4321		alu.dst.chan = i;
4322		alu.dst.write = 1;
4323		alu.is_op3 = 1;
4324		if (i == 3)
4325			alu.last = 1;
4326		r = r600_bytecode_add_alu(ctx->bc, &alu);
4327		if (r)
4328			return r;
4329	}
4330	if (use_temp)
4331		return tgsi_helper_copy(ctx, inst);
4332	return 0;
4333}
4334
4335static int tgsi_exp(struct r600_shader_ctx *ctx)
4336{
4337	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4338	struct r600_bytecode_alu alu;
4339	int r;
4340	int i;
4341
4342	/* result.x = 2^floor(src); */
4343	if (inst->Dst[0].Register.WriteMask & 1) {
4344		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4345
4346		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4347		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4348
4349		alu.dst.sel = ctx->temp_reg;
4350		alu.dst.chan = 0;
4351		alu.dst.write = 1;
4352		alu.last = 1;
4353		r = r600_bytecode_add_alu(ctx->bc, &alu);
4354		if (r)
4355			return r;
4356
4357		if (ctx->bc->chip_class == CAYMAN) {
4358			for (i = 0; i < 3; i++) {
4359				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4360				alu.src[0].sel = ctx->temp_reg;
4361				alu.src[0].chan = 0;
4362
4363				alu.dst.sel = ctx->temp_reg;
4364				alu.dst.chan = i;
4365				if (i == 0)
4366					alu.dst.write = 1;
4367				if (i == 2)
4368					alu.last = 1;
4369				r = r600_bytecode_add_alu(ctx->bc, &alu);
4370				if (r)
4371					return r;
4372			}
4373		} else {
4374			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4375			alu.src[0].sel = ctx->temp_reg;
4376			alu.src[0].chan = 0;
4377
4378			alu.dst.sel = ctx->temp_reg;
4379			alu.dst.chan = 0;
4380			alu.dst.write = 1;
4381			alu.last = 1;
4382			r = r600_bytecode_add_alu(ctx->bc, &alu);
4383			if (r)
4384				return r;
4385		}
4386	}
4387
4388	/* result.y = tmp - floor(tmp); */
4389	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4390		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4391
4392		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
4393		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4394
4395		alu.dst.sel = ctx->temp_reg;
4396#if 0
4397		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4398		if (r)
4399			return r;
4400#endif
4401		alu.dst.write = 1;
4402		alu.dst.chan = 1;
4403
4404		alu.last = 1;
4405
4406		r = r600_bytecode_add_alu(ctx->bc, &alu);
4407		if (r)
4408			return r;
4409	}
4410
4411	/* result.z = RoughApprox2ToX(tmp);*/
4412	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
4413		if (ctx->bc->chip_class == CAYMAN) {
4414			for (i = 0; i < 3; i++) {
4415				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4416				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4417				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4418
4419				alu.dst.sel = ctx->temp_reg;
4420				alu.dst.chan = i;
4421				if (i == 2) {
4422					alu.dst.write = 1;
4423					alu.last = 1;
4424				}
4425
4426				r = r600_bytecode_add_alu(ctx->bc, &alu);
4427				if (r)
4428					return r;
4429			}
4430		} else {
4431			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4432			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4433			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4434
4435			alu.dst.sel = ctx->temp_reg;
4436			alu.dst.write = 1;
4437			alu.dst.chan = 2;
4438
4439			alu.last = 1;
4440
4441			r = r600_bytecode_add_alu(ctx->bc, &alu);
4442			if (r)
4443				return r;
4444		}
4445	}
4446
4447	/* result.w = 1.0;*/
4448	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
4449		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4450
4451		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4452		alu.src[0].sel = V_SQ_ALU_SRC_1;
4453		alu.src[0].chan = 0;
4454
4455		alu.dst.sel = ctx->temp_reg;
4456		alu.dst.chan = 3;
4457		alu.dst.write = 1;
4458		alu.last = 1;
4459		r = r600_bytecode_add_alu(ctx->bc, &alu);
4460		if (r)
4461			return r;
4462	}
4463	return tgsi_helper_copy(ctx, inst);
4464}
4465
4466static int tgsi_log(struct r600_shader_ctx *ctx)
4467{
4468	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4469	struct r600_bytecode_alu alu;
4470	int r;
4471	int i;
4472
4473	/* result.x = floor(log2(|src|)); */
4474	if (inst->Dst[0].Register.WriteMask & 1) {
4475		if (ctx->bc->chip_class == CAYMAN) {
4476			for (i = 0; i < 3; i++) {
4477				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4478
4479				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4480				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4481				r600_bytecode_src_set_abs(&alu.src[0]);
4482
4483				alu.dst.sel = ctx->temp_reg;
4484				alu.dst.chan = i;
4485				if (i == 0)
4486					alu.dst.write = 1;
4487				if (i == 2)
4488					alu.last = 1;
4489				r = r600_bytecode_add_alu(ctx->bc, &alu);
4490				if (r)
4491					return r;
4492			}
4493
4494		} else {
4495			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4496
4497			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4498			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4499			r600_bytecode_src_set_abs(&alu.src[0]);
4500
4501			alu.dst.sel = ctx->temp_reg;
4502			alu.dst.chan = 0;
4503			alu.dst.write = 1;
4504			alu.last = 1;
4505			r = r600_bytecode_add_alu(ctx->bc, &alu);
4506			if (r)
4507				return r;
4508		}
4509
4510		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4511		alu.src[0].sel = ctx->temp_reg;
4512		alu.src[0].chan = 0;
4513
4514		alu.dst.sel = ctx->temp_reg;
4515		alu.dst.chan = 0;
4516		alu.dst.write = 1;
4517		alu.last = 1;
4518
4519		r = r600_bytecode_add_alu(ctx->bc, &alu);
4520		if (r)
4521			return r;
4522	}
4523
4524	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
4525	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4526
4527		if (ctx->bc->chip_class == CAYMAN) {
4528			for (i = 0; i < 3; i++) {
4529				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4530
4531				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4532				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4533				r600_bytecode_src_set_abs(&alu.src[0]);
4534
4535				alu.dst.sel = ctx->temp_reg;
4536				alu.dst.chan = i;
4537				if (i == 1)
4538					alu.dst.write = 1;
4539				if (i == 2)
4540					alu.last = 1;
4541
4542				r = r600_bytecode_add_alu(ctx->bc, &alu);
4543				if (r)
4544					return r;
4545			}
4546		} else {
4547			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4548
4549			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4550			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4551			r600_bytecode_src_set_abs(&alu.src[0]);
4552
4553			alu.dst.sel = ctx->temp_reg;
4554			alu.dst.chan = 1;
4555			alu.dst.write = 1;
4556			alu.last = 1;
4557
4558			r = r600_bytecode_add_alu(ctx->bc, &alu);
4559			if (r)
4560				return r;
4561		}
4562
4563		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4564
4565		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4566		alu.src[0].sel = ctx->temp_reg;
4567		alu.src[0].chan = 1;
4568
4569		alu.dst.sel = ctx->temp_reg;
4570		alu.dst.chan = 1;
4571		alu.dst.write = 1;
4572		alu.last = 1;
4573
4574		r = r600_bytecode_add_alu(ctx->bc, &alu);
4575		if (r)
4576			return r;
4577
4578		if (ctx->bc->chip_class == CAYMAN) {
4579			for (i = 0; i < 3; i++) {
4580				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4581				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4582				alu.src[0].sel = ctx->temp_reg;
4583				alu.src[0].chan = 1;
4584
4585				alu.dst.sel = ctx->temp_reg;
4586				alu.dst.chan = i;
4587				if (i == 1)
4588					alu.dst.write = 1;
4589				if (i == 2)
4590					alu.last = 1;
4591
4592				r = r600_bytecode_add_alu(ctx->bc, &alu);
4593				if (r)
4594					return r;
4595			}
4596		} else {
4597			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4598			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4599			alu.src[0].sel = ctx->temp_reg;
4600			alu.src[0].chan = 1;
4601
4602			alu.dst.sel = ctx->temp_reg;
4603			alu.dst.chan = 1;
4604			alu.dst.write = 1;
4605			alu.last = 1;
4606
4607			r = r600_bytecode_add_alu(ctx->bc, &alu);
4608			if (r)
4609				return r;
4610		}
4611
4612		if (ctx->bc->chip_class == CAYMAN) {
4613			for (i = 0; i < 3; i++) {
4614				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4615				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4616				alu.src[0].sel = ctx->temp_reg;
4617				alu.src[0].chan = 1;
4618
4619				alu.dst.sel = ctx->temp_reg;
4620				alu.dst.chan = i;
4621				if (i == 1)
4622					alu.dst.write = 1;
4623				if (i == 2)
4624					alu.last = 1;
4625
4626				r = r600_bytecode_add_alu(ctx->bc, &alu);
4627				if (r)
4628					return r;
4629			}
4630		} else {
4631			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4632			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4633			alu.src[0].sel = ctx->temp_reg;
4634			alu.src[0].chan = 1;
4635
4636			alu.dst.sel = ctx->temp_reg;
4637			alu.dst.chan = 1;
4638			alu.dst.write = 1;
4639			alu.last = 1;
4640
4641			r = r600_bytecode_add_alu(ctx->bc, &alu);
4642			if (r)
4643				return r;
4644		}
4645
4646		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4647
4648		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4649
4650		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4651		r600_bytecode_src_set_abs(&alu.src[0]);
4652
4653		alu.src[1].sel = ctx->temp_reg;
4654		alu.src[1].chan = 1;
4655
4656		alu.dst.sel = ctx->temp_reg;
4657		alu.dst.chan = 1;
4658		alu.dst.write = 1;
4659		alu.last = 1;
4660
4661		r = r600_bytecode_add_alu(ctx->bc, &alu);
4662		if (r)
4663			return r;
4664	}
4665
4666	/* result.z = log2(|src|);*/
4667	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
4668		if (ctx->bc->chip_class == CAYMAN) {
4669			for (i = 0; i < 3; i++) {
4670				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4671
4672				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4673				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4674				r600_bytecode_src_set_abs(&alu.src[0]);
4675
4676				alu.dst.sel = ctx->temp_reg;
4677				if (i == 2)
4678					alu.dst.write = 1;
4679				alu.dst.chan = i;
4680				if (i == 2)
4681					alu.last = 1;
4682
4683				r = r600_bytecode_add_alu(ctx->bc, &alu);
4684				if (r)
4685					return r;
4686			}
4687		} else {
4688			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4689
4690			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4691			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4692			r600_bytecode_src_set_abs(&alu.src[0]);
4693
4694			alu.dst.sel = ctx->temp_reg;
4695			alu.dst.write = 1;
4696			alu.dst.chan = 2;
4697			alu.last = 1;
4698
4699			r = r600_bytecode_add_alu(ctx->bc, &alu);
4700			if (r)
4701				return r;
4702		}
4703	}
4704
4705	/* result.w = 1.0; */
4706	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
4707		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4708
4709		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4710		alu.src[0].sel = V_SQ_ALU_SRC_1;
4711		alu.src[0].chan = 0;
4712
4713		alu.dst.sel = ctx->temp_reg;
4714		alu.dst.chan = 3;
4715		alu.dst.write = 1;
4716		alu.last = 1;
4717
4718		r = r600_bytecode_add_alu(ctx->bc, &alu);
4719		if (r)
4720			return r;
4721	}
4722
4723	return tgsi_helper_copy(ctx, inst);
4724}
4725
4726static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
4727{
4728	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4729	struct r600_bytecode_alu alu;
4730	int r;
4731
4732	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4733
4734	switch (inst->Instruction.Opcode) {
4735	case TGSI_OPCODE_ARL:
4736		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR;
4737		break;
4738	case TGSI_OPCODE_ARR:
4739		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4740		break;
4741	case TGSI_OPCODE_UARL:
4742		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4743		break;
4744	default:
4745		assert(0);
4746		return -1;
4747	}
4748
4749	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4750	alu.last = 1;
4751	alu.dst.sel = ctx->bc->ar_reg;
4752	alu.dst.write = 1;
4753	r = r600_bytecode_add_alu(ctx->bc, &alu);
4754	if (r)
4755		return r;
4756
4757	ctx->bc->ar_loaded = 0;
4758	return 0;
4759}
4760static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
4761{
4762	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4763	struct r600_bytecode_alu alu;
4764	int r;
4765
4766	switch (inst->Instruction.Opcode) {
4767	case TGSI_OPCODE_ARL:
4768		memset(&alu, 0, sizeof(alu));
4769		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR;
4770		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4771		alu.dst.sel = ctx->bc->ar_reg;
4772		alu.dst.write = 1;
4773		alu.last = 1;
4774
4775		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4776			return r;
4777
4778		memset(&alu, 0, sizeof(alu));
4779		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4780		alu.src[0].sel = ctx->bc->ar_reg;
4781		alu.dst.sel = ctx->bc->ar_reg;
4782		alu.dst.write = 1;
4783		alu.last = 1;
4784
4785		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4786			return r;
4787		break;
4788	case TGSI_OPCODE_ARR:
4789		memset(&alu, 0, sizeof(alu));
4790		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4791		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4792		alu.dst.sel = ctx->bc->ar_reg;
4793		alu.dst.write = 1;
4794		alu.last = 1;
4795
4796		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4797			return r;
4798		break;
4799	case TGSI_OPCODE_UARL:
4800		memset(&alu, 0, sizeof(alu));
4801		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4802		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4803		alu.dst.sel = ctx->bc->ar_reg;
4804		alu.dst.write = 1;
4805		alu.last = 1;
4806
4807		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4808			return r;
4809		break;
4810	default:
4811		assert(0);
4812		return -1;
4813	}
4814
4815	ctx->bc->ar_loaded = 0;
4816	return 0;
4817}
4818
4819static int tgsi_opdst(struct r600_shader_ctx *ctx)
4820{
4821	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4822	struct r600_bytecode_alu alu;
4823	int i, r = 0;
4824
4825	for (i = 0; i < 4; i++) {
4826		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4827
4828		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4829		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4830
4831		if (i == 0 || i == 3) {
4832			alu.src[0].sel = V_SQ_ALU_SRC_1;
4833		} else {
4834			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4835		}
4836
4837		if (i == 0 || i == 2) {
4838			alu.src[1].sel = V_SQ_ALU_SRC_1;
4839		} else {
4840			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4841		}
4842		if (i == 3)
4843			alu.last = 1;
4844		r = r600_bytecode_add_alu(ctx->bc, &alu);
4845		if (r)
4846			return r;
4847	}
4848	return 0;
4849}
4850
4851static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
4852{
4853	struct r600_bytecode_alu alu;
4854	int r;
4855
4856	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4857	alu.inst = opcode;
4858	alu.execute_mask = 1;
4859	alu.update_pred = 1;
4860
4861	alu.dst.sel = ctx->temp_reg;
4862	alu.dst.write = 1;
4863	alu.dst.chan = 0;
4864
4865	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4866	alu.src[1].sel = V_SQ_ALU_SRC_0;
4867	alu.src[1].chan = 0;
4868
4869	alu.last = 1;
4870
4871	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
4872	if (r)
4873		return r;
4874	return 0;
4875}
4876
4877static int pops(struct r600_shader_ctx *ctx, int pops)
4878{
4879	unsigned force_pop = ctx->bc->force_add_cf;
4880
4881	if (!force_pop) {
4882		int alu_pop = 3;
4883		if (ctx->bc->cf_last) {
4884			if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU))
4885				alu_pop = 0;
4886			else if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER))
4887				alu_pop = 1;
4888		}
4889		alu_pop += pops;
4890		if (alu_pop == 1) {
4891			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER);
4892			ctx->bc->force_add_cf = 1;
4893		} else if (alu_pop == 2) {
4894			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER);
4895			ctx->bc->force_add_cf = 1;
4896		} else {
4897			force_pop = 1;
4898		}
4899	}
4900
4901	if (force_pop) {
4902		r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_POP));
4903		ctx->bc->cf_last->pop_count = pops;
4904		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
4905	}
4906
4907	return 0;
4908}
4909
4910static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
4911{
4912	switch(reason) {
4913	case FC_PUSH_VPM:
4914		ctx->bc->callstack[ctx->bc->call_sp].current--;
4915		break;
4916	case FC_PUSH_WQM:
4917	case FC_LOOP:
4918		ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
4919		break;
4920	case FC_REP:
4921		/* TOODO : for 16 vp asic should -= 2; */
4922		ctx->bc->callstack[ctx->bc->call_sp].current --;
4923		break;
4924	}
4925}
4926
4927static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
4928{
4929	if (check_max_only) {
4930		int diff;
4931		switch (reason) {
4932		case FC_PUSH_VPM:
4933			diff = 1;
4934			break;
4935		case FC_PUSH_WQM:
4936			diff = 4;
4937			break;
4938		default:
4939			assert(0);
4940			diff = 0;
4941		}
4942		if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
4943		    ctx->bc->callstack[ctx->bc->call_sp].max) {
4944			ctx->bc->callstack[ctx->bc->call_sp].max =
4945				ctx->bc->callstack[ctx->bc->call_sp].current + diff;
4946		}
4947		return;
4948	}
4949	switch (reason) {
4950	case FC_PUSH_VPM:
4951		ctx->bc->callstack[ctx->bc->call_sp].current++;
4952		break;
4953	case FC_PUSH_WQM:
4954	case FC_LOOP:
4955		ctx->bc->callstack[ctx->bc->call_sp].current += 4;
4956		break;
4957	case FC_REP:
4958		ctx->bc->callstack[ctx->bc->call_sp].current++;
4959		break;
4960	}
4961
4962	if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
4963	    ctx->bc->callstack[ctx->bc->call_sp].max) {
4964		ctx->bc->callstack[ctx->bc->call_sp].max =
4965			ctx->bc->callstack[ctx->bc->call_sp].current;
4966	}
4967}
4968
4969static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
4970{
4971	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
4972
4973	sp->mid = (struct r600_bytecode_cf **)realloc((void *)sp->mid,
4974						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
4975	sp->mid[sp->num_mid] = ctx->bc->cf_last;
4976	sp->num_mid++;
4977}
4978
4979static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
4980{
4981	ctx->bc->fc_sp++;
4982	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
4983	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
4984}
4985
4986static void fc_poplevel(struct r600_shader_ctx *ctx)
4987{
4988	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
4989	if (sp->mid) {
4990		free(sp->mid);
4991		sp->mid = NULL;
4992	}
4993	sp->num_mid = 0;
4994	sp->start = NULL;
4995	sp->type = 0;
4996	ctx->bc->fc_sp--;
4997}
4998
/*
 * Everything between #if 0 and #endif below is compiled out.  It holds
 * unfinished helpers for RETURN / flag-based loop exits:
 * emit_setret_in_loop_flag() and emit_testflag() are empty stubs and
 * emit_jump_to_offset() never fills in the jump target.
 */
#if 0
/* Emit a RETURN CF instruction; always reports success. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
	return 0;
}

/* Emit a JUMP with the given pop count; "offset" is currently unused. */
static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

/* Stub: emits nothing and returns 0. */
static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

/* Stub: emits nothing. */
static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

/* Sequence: test flag, jump, set flag to the zero constant, pop ifidx + 1 levels, return. */
static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

/* Emit the current instruction's CF opcode as a loop exit recorded on level fc_sp, then pop once. */
static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
5046
5047static int tgsi_if(struct r600_shader_ctx *ctx)
5048{
5049	emit_logic_pred(ctx, CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
5050
5051	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
5052
5053	fc_pushlevel(ctx, FC_IF);
5054
5055	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
5056	return 0;
5057}
5058
5059static int tgsi_else(struct r600_shader_ctx *ctx)
5060{
5061	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_ELSE));
5062	ctx->bc->cf_last->pop_count = 1;
5063
5064	fc_set_mid(ctx, ctx->bc->fc_sp);
5065	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
5066	return 0;
5067}
5068
5069static int tgsi_endif(struct r600_shader_ctx *ctx)
5070{
5071	pops(ctx, 1);
5072	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
5073		R600_ERR("if/endif unbalanced in shader\n");
5074		return -1;
5075	}
5076
5077	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
5078		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5079		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
5080	} else {
5081		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
5082	}
5083	fc_poplevel(ctx);
5084
5085	callstack_decrease_current(ctx, FC_PUSH_VPM);
5086	return 0;
5087}
5088
5089static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
5090{
5091	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL));
5092
5093	fc_pushlevel(ctx, FC_LOOP);
5094
5095	/* check stack depth */
5096	callstack_check_depth(ctx, FC_LOOP, 0);
5097	return 0;
5098}
5099
5100static int tgsi_endloop(struct r600_shader_ctx *ctx)
5101{
5102	int i;
5103
5104	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END));
5105
5106	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
5107		R600_ERR("loop/endloop in shader code are not paired.\n");
5108		return -EINVAL;
5109	}
5110
5111	/* fixup loop pointers - from r600isa
5112	   LOOP END points to CF after LOOP START,
5113	   LOOP START point to CF after LOOP END
5114	   BRK/CONT point to LOOP END CF
5115	*/
5116	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
5117
5118	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5119
5120	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
5121		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
5122	}
5123	/* XXX add LOOPRET support */
5124	fc_poplevel(ctx);
5125	callstack_decrease_current(ctx, FC_LOOP);
5126	return 0;
5127}
5128
5129static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
5130{
5131	unsigned int fscp;
5132
5133	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
5134	{
5135		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
5136			break;
5137	}
5138
5139	if (fscp == 0) {
5140		R600_ERR("Break not inside loop/endloop pair\n");
5141		return -EINVAL;
5142	}
5143
5144	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
5145
5146	fc_set_mid(ctx, fscp);
5147
5148	callstack_check_depth(ctx, FC_PUSH_VPM, 1);
5149	return 0;
5150}
5151
5152static int tgsi_umad(struct r600_shader_ctx *ctx)
5153{
5154	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5155	struct r600_bytecode_alu alu;
5156	int i, j, r;
5157	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5158
5159	/* src0 * src1 */
5160	for (i = 0; i < lasti + 1; i++) {
5161		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5162			continue;
5163
5164		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5165
5166		alu.dst.chan = i;
5167		alu.dst.sel = ctx->temp_reg;
5168		alu.dst.write = 1;
5169
5170		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
5171		for (j = 0; j < 2; j++) {
5172		        r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5173		}
5174
5175		alu.last = 1;
5176		r = r600_bytecode_add_alu(ctx->bc, &alu);
5177		if (r)
5178			return r;
5179	}
5180
5181
5182	for (i = 0; i < lasti + 1; i++) {
5183		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5184			continue;
5185
5186		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5187		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5188
5189		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
5190
5191		alu.src[0].sel = ctx->temp_reg;
5192		alu.src[0].chan = i;
5193
5194		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5195		if (i == lasti) {
5196			alu.last = 1;
5197		}
5198		r = r600_bytecode_add_alu(ctx->bc, &alu);
5199		if (r)
5200			return r;
5201	}
5202	return 0;
5203}
5204
5205static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
5206	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5207	{TGSI_OPCODE_MOV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5208	{TGSI_OPCODE_LIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5209
5210	/* XXX:
5211	 * For state trackers other than OpenGL, we'll want to use
5212	 * _RECIP_IEEE instead.
5213	 */
5214	{TGSI_OPCODE_RCP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
5215
5216	{TGSI_OPCODE_RSQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_rsq},
5217	{TGSI_OPCODE_EXP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5218	{TGSI_OPCODE_LOG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5219	{TGSI_OPCODE_MUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5220	{TGSI_OPCODE_ADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5221	{TGSI_OPCODE_DP3,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5222	{TGSI_OPCODE_DP4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5223	{TGSI_OPCODE_DST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5224	{TGSI_OPCODE_MIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5225	{TGSI_OPCODE_MAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5226	{TGSI_OPCODE_SLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5227	{TGSI_OPCODE_SGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5228	{TGSI_OPCODE_MAD,	1, V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5229	{TGSI_OPCODE_SUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5230	{TGSI_OPCODE_LRP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5231	{TGSI_OPCODE_CND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5232	/* gap */
5233	{20,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5234	{TGSI_OPCODE_DP2A,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5235	/* gap */
5236	{22,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5237	{23,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5238	{TGSI_OPCODE_FRC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5239	{TGSI_OPCODE_CLAMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5240	{TGSI_OPCODE_FLR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5241	{TGSI_OPCODE_ROUND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5242	{TGSI_OPCODE_EX2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5243	{TGSI_OPCODE_LG2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5244	{TGSI_OPCODE_POW,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5245	{TGSI_OPCODE_XPD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5246	/* gap */
5247	{32,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5248	{TGSI_OPCODE_ABS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5249	{TGSI_OPCODE_RCC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5250	{TGSI_OPCODE_DPH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5251	{TGSI_OPCODE_COS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5252	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5253	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5254	{TGSI_OPCODE_KILP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5255	{TGSI_OPCODE_PK2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5256	{TGSI_OPCODE_PK2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5257	{TGSI_OPCODE_PK4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5258	{TGSI_OPCODE_PK4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5259	{TGSI_OPCODE_RFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5260	{TGSI_OPCODE_SEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5261	{TGSI_OPCODE_SFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5262	{TGSI_OPCODE_SGT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5263	{TGSI_OPCODE_SIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5264	{TGSI_OPCODE_SLE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5265	{TGSI_OPCODE_SNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5266	{TGSI_OPCODE_STR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5267	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5268	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5269	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5270	{TGSI_OPCODE_UP2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5271	{TGSI_OPCODE_UP2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5272	{TGSI_OPCODE_UP4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5273	{TGSI_OPCODE_UP4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5274	{TGSI_OPCODE_X2D,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5275	{TGSI_OPCODE_ARA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5276	{TGSI_OPCODE_ARR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5277	{TGSI_OPCODE_BRA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5278	{TGSI_OPCODE_CAL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5279	{TGSI_OPCODE_RET,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5280	{TGSI_OPCODE_SSG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5281	{TGSI_OPCODE_CMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5282	{TGSI_OPCODE_SCS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5283	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5284	{TGSI_OPCODE_NRM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5285	{TGSI_OPCODE_DIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5286	{TGSI_OPCODE_DP2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5287	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5288	{TGSI_OPCODE_BRK,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5289	{TGSI_OPCODE_IF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5290	/* gap */
5291	{75,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5292	{76,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5293	{TGSI_OPCODE_ELSE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5294	{TGSI_OPCODE_ENDIF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5295	/* gap */
5296	{79,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5297	{80,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5298	{TGSI_OPCODE_PUSHA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5299	{TGSI_OPCODE_POPA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5300	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5301	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5302	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5303	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5304	{TGSI_OPCODE_SHL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2_trans},
5305	/* gap */
5306	{88,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5307	{TGSI_OPCODE_AND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5308	{TGSI_OPCODE_OR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5309	{TGSI_OPCODE_MOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5310	{TGSI_OPCODE_XOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5311	{TGSI_OPCODE_SAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5312	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5313	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5314	{TGSI_OPCODE_CONT,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5315	{TGSI_OPCODE_EMIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5316	{TGSI_OPCODE_ENDPRIM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5317	{TGSI_OPCODE_BGNLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5318	{TGSI_OPCODE_BGNSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5319	{TGSI_OPCODE_ENDLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5320	{TGSI_OPCODE_ENDSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5321	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5322	/* gap */
5323	{104,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5324	{105,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5325	{106,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5326	{TGSI_OPCODE_NOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5327	/* gap */
5328	{108,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5329	{109,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5330	{110,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5331	{111,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5332	{TGSI_OPCODE_NRM4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5333	{TGSI_OPCODE_CALLNZ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5334	{TGSI_OPCODE_IFC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5335	{TGSI_OPCODE_BREAKC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5336	{TGSI_OPCODE_KIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5337	{TGSI_OPCODE_END,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5338	/* gap */
5339	{118,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5340	{TGSI_OPCODE_F2I,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2_trans},
5341	{TGSI_OPCODE_IDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5342	{TGSI_OPCODE_IMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5343	{TGSI_OPCODE_IMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5344	{TGSI_OPCODE_INEG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5345	{TGSI_OPCODE_ISGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5346	{TGSI_OPCODE_ISHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2_trans},
5347	{TGSI_OPCODE_ISLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5348	{TGSI_OPCODE_F2U,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
5349	{TGSI_OPCODE_U2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5350	{TGSI_OPCODE_UADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5351	{TGSI_OPCODE_UDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5352	{TGSI_OPCODE_UMAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5353	{TGSI_OPCODE_UMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5354	{TGSI_OPCODE_UMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5355	{TGSI_OPCODE_UMOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5356	{TGSI_OPCODE_UMUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5357	{TGSI_OPCODE_USEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5358	{TGSI_OPCODE_USGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5359	{TGSI_OPCODE_USHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2_trans},
5360	{TGSI_OPCODE_USLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5361	{TGSI_OPCODE_USNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2_swap},
5362	{TGSI_OPCODE_SWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5363	{TGSI_OPCODE_CASE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5364	{TGSI_OPCODE_DEFAULT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5365	{TGSI_OPCODE_ENDSWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5366	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5367	{TGSI_OPCODE_SAMPLE_I,  0, 0, tgsi_unsupported},
5368	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
5369	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5370	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5371	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5372	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5373	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5374	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5375	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5376	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5377	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5378	{TGSI_OPCODE_UARL,      0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_r600_arl},
5379	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5380	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5381	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5382	{TGSI_OPCODE_LAST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5383};
5384
/*
 * TGSI -> Evergreen translation table.
 *
 * Same row layout as r600_shader_tgsi_instruction[]: TGSI opcode, a flag
 * (1 only for the OP3 MULADD row), the hardware opcode (EG_-prefixed ALU/CF
 * values, or SQ_TEX_INST_* for texture ops) and the handler that emits the
 * bytecode.
 *
 * Rows are listed in ascending TGSI opcode order with numeric placeholder
 * rows marked "gap" for unused opcode values, so the table is presumably
 * indexed directly by opcode value: keep row positions aligned when editing.
 *
 * Opcodes without an implementation are routed to tgsi_unsupported.
 */
static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, tgsi_trans_srcx_replicate},
	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, tgsi_rsq},
	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* gap */
	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* gap */
	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
	/* gap */
	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
	/* BRK and CONT share one handler; only the CF opcode in column 3 differs */
	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
	/* gap */
	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
	/* gap */
	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
	/* NOTE: the shifts (SHL/ISHR/USHR) use plain tgsi_op2 in this table,
	 * whereas the R6xx/R7xx table routes them through tgsi_op2_trans. */
	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
	/* gap */
	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
	/* gap */
	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* gap */
	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
	/* gap */
	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* F2I and F2U both go through the dedicated tgsi_f2i handler here
	 * (defined elsewhere in this file) rather than a generic op2 helper. */
	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_f2i},
	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_f2i},
	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* The SAMPLE_xxx, GATHER4 and SVIEWINFO rows below are all unsupported */
	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
};
5558
5559static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
5560	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5561	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5562	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5563	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, cayman_emit_float_instr},
5564	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, cayman_emit_float_instr},
5565	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5566	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5567	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5568	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5569	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5570	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5571	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5572	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5573	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5574	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5575	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5576	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5577	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5578	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5579	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5580	/* gap */
5581	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5582	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5583	/* gap */
5584	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5585	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5586	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5587	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5588	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5589	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5590	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, cayman_emit_float_instr},
5591	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, cayman_emit_float_instr},
5592	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, cayman_pow},
5593	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5594	/* gap */
5595	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5596	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5597	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5598	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5599	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, cayman_trig},
5600	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5601	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5602	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5603	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5604	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5605	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5606	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5607	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5608	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5609	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5610	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5611	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, cayman_trig},
5612	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5613	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5614	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5615	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5616	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5617	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5618	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5619	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5620	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5621	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5622	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5623	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5624	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5625	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5626	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5627	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5628	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5629	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5630	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5631	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5632	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5633	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5634	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5635	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5636	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5637	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5638	/* gap */
5639	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5640	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5641	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5642	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5643	/* gap */
5644	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5645	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5646	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5647	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5648	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5649	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2},
5650	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5651	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5652	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
5653	/* gap */
5654	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5655	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5656	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5657	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5658	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5659	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5660	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5661	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5662	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5663	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5664	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5665	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5666	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5667	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5668	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5669	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5670	/* gap */
5671	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5672	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5673	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5674	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5675	/* gap */
5676	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5677	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5678	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5679	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5680	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5681	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5682	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5683	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5684	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5685	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5686	/* gap */
5687	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5688	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2},
5689	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5690	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5691	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5692	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5693	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5694	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
5695	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5696	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
5697	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2},
5698	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5699	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5700	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5701	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5702	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5703	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5704	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT, cayman_mul_int_instr},
5705	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5706	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5707	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
5708	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5709	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
5710	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5711	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5712	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5713	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5714	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5715	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
5716	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
5717	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5718	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5719	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5720	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5721	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5722	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5723	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5724	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5725	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5726	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
5727	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5728	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5729	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5730	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5731};
5732