r600_shader.c revision a1a3792b180e453d26bfd09853eee88460dfc466
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600d.h"
28
29#include "pipe/p_shader_tokens.h"
30#include "tgsi/tgsi_info.h"
31#include "tgsi/tgsi_parse.h"
32#include "tgsi/tgsi_scan.h"
33#include "tgsi/tgsi_dump.h"
34#include "util/u_memory.h"
35#include <stdio.h>
36#include <errno.h>
37#include <byteswap.h>
38
/* CAYMAN notes
This explains why many instructions are emitted in loops on CAYMAN.

-These 8xx t-slot only ops are implemented in all vector slots.
MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
These 8xx t-slot only opcodes become vector ops, with all four
slots expecting the arguments on sources a and b. Result is
broadcast to all channels.
MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
These 8xx t-slot only opcodes become vector ops in the z, y, and
x slots.
EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
SQRT_IEEE/_64
SIN/COS
The w slot may have an independent co-issued operation, or if the
result is required to be in the w slot, the opcode above may be
issued in the w slot as well.
The compiler must issue the source argument to slots z, y, and x.
*/
59
60static int r600_pipe_shader(struct pipe_context *ctx, struct r600_pipe_shader *shader)
61{
62	struct r600_context *rctx = (struct r600_context *)ctx;
63	struct r600_shader *rshader = &shader->shader;
64	uint32_t *ptr;
65	int	i;
66
67	/* copy new shader */
68	if (shader->bo == NULL) {
69		shader->bo = (struct r600_resource*)
70			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, rshader->bc.ndw * 4);
71		if (shader->bo == NULL) {
72			return -ENOMEM;
73		}
74		ptr = (uint32_t*)rctx->ws->buffer_map(shader->bo->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
75		if (R600_BIG_ENDIAN) {
76			for (i = 0; i < rshader->bc.ndw; ++i) {
77				ptr[i] = bswap_32(rshader->bc.bytecode[i]);
78			}
79		} else {
80			memcpy(ptr, rshader->bc.bytecode, rshader->bc.ndw * sizeof(*ptr));
81		}
82		rctx->ws->buffer_unmap(shader->bo->cs_buf);
83	}
84	/* build state */
85	switch (rshader->processor_type) {
86	case TGSI_PROCESSOR_VERTEX:
87		if (rctx->chip_class >= EVERGREEN) {
88			evergreen_pipe_shader_vs(ctx, shader);
89		} else {
90			r600_pipe_shader_vs(ctx, shader);
91		}
92		break;
93	case TGSI_PROCESSOR_FRAGMENT:
94		if (rctx->chip_class >= EVERGREEN) {
95			evergreen_pipe_shader_ps(ctx, shader);
96		} else {
97			r600_pipe_shader_ps(ctx, shader);
98		}
99		break;
100	default:
101		return -EINVAL;
102	}
103	return 0;
104}
105
/* Forward declaration: called by r600_pipe_shader_create() below. */
static int
r600_shader_from_tgsi(struct r600_screen *rscreen,
		      struct r600_pipe_shader *pipeshader,
		      struct r600_shader_key key);
109
110int r600_pipe_shader_create(struct pipe_context *ctx,
111			    struct r600_pipe_shader *shader,
112			    struct r600_shader_key key)
113{
114	static int dump_shaders = -1;
115	struct r600_context *rctx = (struct r600_context *)ctx;
116	struct r600_pipe_shader_selector *sel = shader->selector;
117	int r;
118
119	/* Would like some magic "get_bool_option_once" routine.
120	*/
121	if (dump_shaders == -1)
122		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
123
124	if (dump_shaders) {
125		fprintf(stderr, "--------------------------------------------------------------\n");
126		tgsi_dump(sel->tokens, 0);
127
128		if (sel->so.num_outputs) {
129			unsigned i;
130			fprintf(stderr, "STREAMOUT\n");
131			for (i = 0; i < sel->so.num_outputs; i++) {
132				unsigned mask = ((1 << sel->so.output[i].num_components) - 1) <<
133						sel->so.output[i].start_component;
134				fprintf(stderr, "  %i: MEM_STREAM0_BUF%i OUT[%i].%s%s%s%s\n", i,
135					sel->so.output[i].output_buffer, sel->so.output[i].register_index,
136				        mask & 1 ? "x" : "_",
137				        (mask >> 1) & 1 ? "y" : "_",
138				        (mask >> 2) & 1 ? "z" : "_",
139				        (mask >> 3) & 1 ? "w" : "_");
140			}
141		}
142	}
143	r = r600_shader_from_tgsi(rctx->screen, shader, key);
144	if (r) {
145		R600_ERR("translation from TGSI failed !\n");
146		return r;
147	}
148	r = r600_bytecode_build(&shader->shader.bc);
149	if (r) {
150		R600_ERR("building bytecode failed !\n");
151		return r;
152	}
153	if (dump_shaders) {
154		r600_bytecode_dump(&shader->shader.bc);
155		fprintf(stderr, "______________________________________________________________\n");
156	}
157	return r600_pipe_shader(ctx, shader);
158}
159
160void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
161{
162	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
163	r600_bytecode_clear(&shader->shader.bc);
164}
165
166/*
167 * tgsi -> r600 shader
168 */
struct r600_shader_tgsi_instruction;

/* One source operand after translation from TGSI (filled in by tgsi_src()). */
struct r600_shader_src {
	/* source selector: register index plus file offset, or a special
	 * value such as V_SQ_ALU_SRC_LITERAL */
	unsigned sel;
	/* per-channel swizzle (x, y, z, w) */
	unsigned swizzle[4];
	/* negate flag */
	unsigned neg;
	/* absolute-value flag */
	unsigned abs;
	/* set to V_SQ_REL_RELATIVE for indirectly addressed registers */
	unsigned rel;
	/* the four literal dwords when sel is V_SQ_ALU_SRC_LITERAL */
	uint32_t value[4];
};
179
180struct r600_shader_ctx {
181	struct tgsi_shader_info			info;
182	struct tgsi_parse_context		parse;
183	const struct tgsi_token			*tokens;
184	unsigned				type;
185	unsigned				file_offset[TGSI_FILE_COUNT];
186	unsigned				temp_reg;
187	struct r600_shader_tgsi_instruction	*inst_info;
188	struct r600_bytecode			*bc;
189	struct r600_shader			*shader;
190	struct r600_shader_src			src[4];
191	uint32_t				*literals;
192	uint32_t				nliterals;
193	uint32_t				max_driver_temp_used;
194	/* needed for evergreen interpolation */
195	boolean                                 input_centroid;
196	boolean                                 input_linear;
197	boolean                                 input_perspective;
198	int					num_interp_gpr;
199	int					face_gpr;
200	int					colors_used;
201	boolean                 clip_vertex_write;
202	unsigned                cv_output;
203	int					fragcoord_input;
204	int					native_integers;
205};
206
/*
 * One entry of the per-chip instruction tables declared below; the tables
 * are indexed by TGSI opcode.
 */
struct r600_shader_tgsi_instruction {
	unsigned tgsi_opcode;
	unsigned is_op3;
	unsigned r600_opcode;
	/* handler invoked with the translation context
	 * (presumably emits the bytecode for this opcode -- confirm) */
	int (*process)(struct r600_shader_ctx *ctx);
};
213
214static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
215static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
216static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
217static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
218static int tgsi_else(struct r600_shader_ctx *ctx);
219static int tgsi_endif(struct r600_shader_ctx *ctx);
220static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
221static int tgsi_endloop(struct r600_shader_ctx *ctx);
222static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
223
224/*
225 * bytestream -> r600 shader
226 *
227 * These functions are used to transform the output of the LLVM backend into
228 * struct r600_bytecode.
229 */
230
/* Forward declaration: used by r600_compute_shader_create() below. */
static void
r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
			       unsigned char * bytes,
			       unsigned num_bytes);
233
#ifdef HAVE_OPENCL
/*
 * Build an r600 bytecode object for a compute kernel from an LLVM module.
 *
 * The module is compiled to a byte stream with r600_llvm_compile(), the
 * stream is decoded into *bytecode, a CF end is appended on CAYMAN, and the
 * bytecode is built. With R600_DUMP_SHADERS set the result is dumped.
 *
 * Always returns 1 (kept for existing callers); a failing bytecode build is
 * only reported through R600_ERR.
 */
int r600_compute_shader_create(struct pipe_context * ctx,
	LLVMModuleRef mod,  struct r600_bytecode * bytecode)
{
	struct r600_context *r600_ctx = (struct r600_context *)ctx;
	/* Start from an empty stream so that nothing is parsed, and free()
	 * is a no-op, should the compile step leave these untouched. */
	unsigned char * bytes = NULL;
	unsigned byte_count = 0;
	struct r600_shader_ctx shader_ctx;
	unsigned dump = 0;
	int r;

	if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
		dump = 1;
	}

	/* Only .bc is set up below; zero everything else so the byte-stream
	 * parsers never read indeterminate context fields. */
	memset(&shader_ctx, 0, sizeof(shader_ctx));

	r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
	shader_ctx.bc = bytecode;
	r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family);
	shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
	r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
	if (shader_ctx.bc->chip_class == CAYMAN) {
		cm_bytecode_add_cf_end(shader_ctx.bc);
	}
	r = r600_bytecode_build(shader_ctx.bc);
	if (r) {
		R600_ERR("building bytecode failed !\n");
	}
	if (dump) {
		r600_bytecode_dump(shader_ctx.bc);
	}
	free(bytes);
	return 1;
}

#endif /* HAVE_OPENCL */
265
/*
 * Read a little-endian 32-bit value from the byte stream.
 *
 * bytes       start of the stream
 * bytes_read  in/out read position; advanced by four
 *
 * Returns the decoded value. No bounds checking is done here.
 */
static uint32_t i32_from_byte_stream(unsigned char * bytes,
		unsigned * bytes_read)
{
	unsigned i;
	uint32_t out = 0;
	for (i = 0; i < 4; i++) {
		/* Widen before shifting: an unsigned char promotes to int,
		 * and shifting a byte >= 0x80 left by 24 would overflow it. */
		out |= (uint32_t)bytes[(*bytes_read)++] << (8 * i);
	}
	return out;
}
276
277static unsigned r600_src_from_byte_stream(unsigned char * bytes,
278		unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
279{
280	unsigned i;
281	unsigned sel0, sel1;
282	sel0 = bytes[bytes_read++];
283	sel1 = bytes[bytes_read++];
284	alu->src[src_idx].sel = sel0 | (sel1 << 8);
285	alu->src[src_idx].chan = bytes[bytes_read++];
286	alu->src[src_idx].neg = bytes[bytes_read++];
287	alu->src[src_idx].abs = bytes[bytes_read++];
288	alu->src[src_idx].rel = bytes[bytes_read++];
289	alu->src[src_idx].kc_bank = bytes[bytes_read++];
290	for (i = 0; i < 4; i++) {
291		alu->src[src_idx].value |= bytes[bytes_read++] << (i * 8);
292	}
293	return bytes_read;
294}
295
296static unsigned r600_alu_from_byte_stream(struct r600_shader_ctx *ctx,
297				unsigned char * bytes, unsigned bytes_read)
298{
299	unsigned src_idx;
300	struct r600_bytecode_alu alu;
301	unsigned src_const_reg[3];
302	uint32_t word0, word1;
303
304	memset(&alu, 0, sizeof(alu));
305	for(src_idx = 0; src_idx < 3; src_idx++) {
306		unsigned i;
307		src_const_reg[src_idx] = bytes[bytes_read++];
308		for (i = 0; i < 4; i++) {
309			alu.src[src_idx].value |= bytes[bytes_read++] << (i * 8);
310		}
311	}
312
313	word0 = i32_from_byte_stream(bytes, &bytes_read);
314	word1 = i32_from_byte_stream(bytes, &bytes_read);
315
316	switch(ctx->bc->chip_class) {
317	case R600:
318		r600_bytecode_alu_read(&alu, word0, word1);
319		break;
320	case R700:
321	case EVERGREEN:
322	case CAYMAN:
323		r700_bytecode_alu_read(&alu, word0, word1);
324		break;
325	}
326
327	for(src_idx = 0; src_idx < 3; src_idx++) {
328		if (src_const_reg[src_idx])
329			alu.src[src_idx].sel += 512;
330	}
331
332	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE) ||
333	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE) ||
334	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT) ||
335	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT)) {
336		alu.update_pred = 1;
337		alu.dst.write = 0;
338		alu.src[1].sel = V_SQ_ALU_SRC_0;
339		alu.src[1].chan = 0;
340		alu.last = 1;
341	}
342
343	if (alu.execute_mask) {
344		alu.pred_sel = 0;
345		r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
346	} else {
347		r600_bytecode_add_alu(ctx->bc, &alu);
348	}
349
350	/* XXX: Handle other KILL instructions */
351	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT)) {
352		ctx->shader->uses_kill = 1;
353		/* XXX: This should be enforced in the LLVM backend. */
354		ctx->bc->force_add_cf = 1;
355	}
356	return bytes_read;
357}
358
359static void llvm_if(struct r600_shader_ctx *ctx, struct r600_bytecode_alu * alu,
360	unsigned pred_inst)
361{
362	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
363	fc_pushlevel(ctx, FC_IF);
364	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
365}
366
367static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx,
368			struct r600_bytecode_alu *alu, unsigned compare_opcode)
369{
370	unsigned opcode = TGSI_OPCODE_BRK;
371	if (ctx->bc->chip_class == CAYMAN)
372		ctx->inst_info = &cm_shader_tgsi_instruction[opcode];
373	else if (ctx->bc->chip_class >= EVERGREEN)
374		ctx->inst_info = &eg_shader_tgsi_instruction[opcode];
375	else
376		ctx->inst_info = &r600_shader_tgsi_instruction[opcode];
377	llvm_if(ctx, alu, compare_opcode);
378	tgsi_loop_brk_cont(ctx);
379	tgsi_endif(ctx);
380}
381
382static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
383				unsigned char * bytes, unsigned bytes_read)
384{
385	struct r600_bytecode_alu alu;
386	unsigned inst;
387	memset(&alu, 0, sizeof(alu));
388	bytes_read = r600_src_from_byte_stream(bytes, bytes_read, &alu, 0);
389	inst = bytes[bytes_read++];
390	switch (inst) {
391	case 0: /* FC_IF */
392		llvm_if(ctx, &alu,
393			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
394		break;
395	case 1: /* FC_IF_INT */
396		llvm_if(ctx, &alu,
397			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
398		break;
399	case 2: /* FC_ELSE */
400		tgsi_else(ctx);
401		break;
402	case 3: /* FC_ENDIF */
403		tgsi_endif(ctx);
404		break;
405	case 4: /* FC_BGNLOOP */
406		tgsi_bgnloop(ctx);
407		break;
408	case 5: /* FC_ENDLOOP */
409		tgsi_endloop(ctx);
410		break;
411	case 6: /* FC_BREAK */
412		r600_break_from_byte_stream(ctx, &alu,
413			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
414		break;
415	case 7: /* FC_BREAK_NZ_INT */
416		r600_break_from_byte_stream(ctx, &alu,
417			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
418		break;
419	case 8: /* FC_CONTINUE */
420		{
421			unsigned opcode = TGSI_OPCODE_CONT;
422			if (ctx->bc->chip_class == CAYMAN) {
423				ctx->inst_info =
424					&cm_shader_tgsi_instruction[opcode];
425			} else if (ctx->bc->chip_class >= EVERGREEN) {
426				ctx->inst_info =
427					&eg_shader_tgsi_instruction[opcode];
428			} else {
429				ctx->inst_info =
430					&r600_shader_tgsi_instruction[opcode];
431			}
432			tgsi_loop_brk_cont(ctx);
433		}
434		break;
435	case 9: /* FC_BREAK_Z_INT */
436		r600_break_from_byte_stream(ctx, &alu,
437			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
438		break;
439	case 10: /* FC_BREAK_NZ */
440		r600_break_from_byte_stream(ctx, &alu,
441			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
442		break;
443	}
444
445	return bytes_read;
446}
447
448static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
449				unsigned char * bytes, unsigned bytes_read)
450{
451	struct r600_bytecode_tex tex;
452
453	tex.inst = bytes[bytes_read++];
454	tex.resource_id = bytes[bytes_read++];
455	tex.src_gpr = bytes[bytes_read++];
456	tex.src_rel = bytes[bytes_read++];
457	tex.dst_gpr = bytes[bytes_read++];
458	tex.dst_rel = bytes[bytes_read++];
459	tex.dst_sel_x = bytes[bytes_read++];
460	tex.dst_sel_y = bytes[bytes_read++];
461	tex.dst_sel_z = bytes[bytes_read++];
462	tex.dst_sel_w = bytes[bytes_read++];
463	tex.lod_bias = bytes[bytes_read++];
464	tex.coord_type_x = bytes[bytes_read++];
465	tex.coord_type_y = bytes[bytes_read++];
466	tex.coord_type_z = bytes[bytes_read++];
467	tex.coord_type_w = bytes[bytes_read++];
468	tex.offset_x = bytes[bytes_read++];
469	tex.offset_y = bytes[bytes_read++];
470	tex.offset_z = bytes[bytes_read++];
471	tex.sampler_id = bytes[bytes_read++];
472	tex.src_sel_x = bytes[bytes_read++];
473	tex.src_sel_y = bytes[bytes_read++];
474	tex.src_sel_z = bytes[bytes_read++];
475	tex.src_sel_w = bytes[bytes_read++];
476
477	r600_bytecode_add_tex(ctx->bc, &tex);
478
479	return bytes_read;
480}
481
482static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
483	unsigned char * bytes, unsigned bytes_read)
484{
485	struct r600_bytecode_vtx vtx;
486
487	uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
488        uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
489	uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);
490
491	memset(&vtx, 0, sizeof(vtx));
492
493	/* WORD0 */
494	vtx.inst = G_SQ_VTX_WORD0_VTX_INST(word0);
495	vtx.fetch_type = G_SQ_VTX_WORD0_FETCH_TYPE(word0);
496	vtx.buffer_id = G_SQ_VTX_WORD0_BUFFER_ID(word0);
497	vtx.src_gpr = G_SQ_VTX_WORD0_SRC_GPR(word0);
498	vtx.src_sel_x = G_SQ_VTX_WORD0_SRC_SEL_X(word0);
499	vtx.mega_fetch_count = G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(word0);
500
501	/* WORD1 */
502	vtx.dst_gpr = G_SQ_VTX_WORD1_GPR_DST_GPR(word1);
503	vtx.dst_sel_x = G_SQ_VTX_WORD1_DST_SEL_X(word1);
504	vtx.dst_sel_y = G_SQ_VTX_WORD1_DST_SEL_Y(word1);
505	vtx.dst_sel_z = G_SQ_VTX_WORD1_DST_SEL_Z(word1);
506	vtx.dst_sel_w = G_SQ_VTX_WORD1_DST_SEL_W(word1);
507	vtx.use_const_fields = G_SQ_VTX_WORD1_USE_CONST_FIELDS(word1);
508	vtx.data_format = G_SQ_VTX_WORD1_DATA_FORMAT(word1);
509	vtx.num_format_all = G_SQ_VTX_WORD1_NUM_FORMAT_ALL(word1);
510	vtx.format_comp_all = G_SQ_VTX_WORD1_FORMAT_COMP_ALL(word1);
511	vtx.srf_mode_all = G_SQ_VTX_WORD1_SRF_MODE_ALL(word1);
512
513	/* WORD 2*/
514	vtx.offset = G_SQ_VTX_WORD2_OFFSET(word2);
515	vtx.endian = G_SQ_VTX_WORD2_ENDIAN_SWAP(word2);
516
517	if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
518		fprintf(stderr, "Error adding vtx\n");
519	}
520	/* Use the Texture Cache */
521	ctx->bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX;
522	return bytes_read;
523}
524
525static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
526				unsigned char * bytes,	unsigned num_bytes)
527{
528	unsigned bytes_read = 0;
529	unsigned i, byte;
530	while (bytes_read < num_bytes) {
531		char inst_type = bytes[bytes_read++];
532		switch (inst_type) {
533		case 0:
534			bytes_read = r600_alu_from_byte_stream(ctx, bytes,
535								bytes_read);
536			break;
537		case 1:
538			bytes_read = r600_tex_from_byte_stream(ctx, bytes,
539								bytes_read);
540			break;
541		case 2:
542			bytes_read = r600_fc_from_byte_stream(ctx, bytes,
543								bytes_read);
544			break;
545		case 3:
546			r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
547			for (i = 0; i < 2; i++) {
548				for (byte = 0 ; byte < 4; byte++) {
549					ctx->bc->cf_last->isa[i] |=
550					(bytes[bytes_read++] << (byte * 8));
551				}
552			}
553			break;
554
555		case 4:
556			bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
557								bytes_read);
558			break;
559		default:
560			/* XXX: Error here */
561			break;
562		}
563	}
564}
565
566/* End bytestream -> r600 shader functions*/
567
568static int tgsi_is_supported(struct r600_shader_ctx *ctx)
569{
570	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
571	int j;
572
573	if (i->Instruction.NumDstRegs > 1) {
574		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
575		return -EINVAL;
576	}
577	if (i->Instruction.Predicate) {
578		R600_ERR("predicate unsupported\n");
579		return -EINVAL;
580	}
581#if 0
582	if (i->Instruction.Label) {
583		R600_ERR("label unsupported\n");
584		return -EINVAL;
585	}
586#endif
587	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
588		if (i->Src[j].Register.Dimension) {
589			R600_ERR("unsupported src %d (dimension %d)\n", j,
590				 i->Src[j].Register.Dimension);
591			return -EINVAL;
592		}
593	}
594	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
595		if (i->Dst[j].Register.Dimension) {
596			R600_ERR("unsupported dst (dimension)\n");
597			return -EINVAL;
598		}
599	}
600	return 0;
601}
602
603static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
604{
605	int i, r;
606	struct r600_bytecode_alu alu;
607	int gpr = 0, base_chan = 0;
608	int ij_index = 0;
609
610	if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
611		ij_index = 0;
612		if (ctx->shader->input[input].centroid)
613			ij_index++;
614	} else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
615		ij_index = 0;
616		/* if we have perspective add one */
617		if (ctx->input_perspective)  {
618			ij_index++;
619			/* if we have perspective centroid */
620			if (ctx->input_centroid)
621				ij_index++;
622		}
623		if (ctx->shader->input[input].centroid)
624			ij_index++;
625	}
626
627	/* work out gpr and base_chan from index */
628	gpr = ij_index / 2;
629	base_chan = (2 * (ij_index % 2)) + 1;
630
631	for (i = 0; i < 8; i++) {
632		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
633
634		if (i < 4)
635			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_ZW;
636		else
637			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_XY;
638
639		if ((i > 1) && (i < 6)) {
640			alu.dst.sel = ctx->shader->input[input].gpr;
641			alu.dst.write = 1;
642		}
643
644		alu.dst.chan = i % 4;
645
646		alu.src[0].sel = gpr;
647		alu.src[0].chan = (base_chan - (i % 2));
648
649		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
650
651		alu.bank_swizzle_force = SQ_ALU_VEC_210;
652		if ((i % 4) == 3)
653			alu.last = 1;
654		r = r600_bytecode_add_alu(ctx->bc, &alu);
655		if (r)
656			return r;
657	}
658	return 0;
659}
660
661static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
662{
663	int i, r;
664	struct r600_bytecode_alu alu;
665
666	for (i = 0; i < 4; i++) {
667		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
668
669		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P0;
670
671		alu.dst.sel = ctx->shader->input[input].gpr;
672		alu.dst.write = 1;
673
674		alu.dst.chan = i;
675
676		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
677		alu.src[0].chan = i;
678
679		if (i == 3)
680			alu.last = 1;
681		r = r600_bytecode_add_alu(ctx->bc, &alu);
682		if (r)
683			return r;
684	}
685	return 0;
686}
687
/*
 * Special export handling in shaders
 *
 * shader export ARRAY_BASE for EXPORT_POS:
 * 60 is position
 * 61 is misc vector
 * 62, 63 are clip distance vectors
 *
 * The use of the values exported in 61-63 is controlled by PA_CL_VS_OUT_CNTL:
 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
 * exclusive with render target index)
 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
 *
 *
 * shader export ARRAY_BASE for EXPORT_PIXEL:
 * 0-7 CB targets
 * 61 computed Z vector
 *
 * The use of the values exported in the computed Z vector is controlled
 * by DB_SHADER_CONTROL:
 * Z_EXPORT_ENABLE - Z as a float in RED
 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
 * DB_SOURCE_FORMAT - export control restrictions
 *
 */
720
721
722/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
723static int r600_spi_sid(struct r600_shader_io * io)
724{
725	int index, name = io->name;
726
727	/* These params are handled differently, they don't need
728	 * semantic indices, so we'll use 0 for them.
729	 */
730	if (name == TGSI_SEMANTIC_POSITION ||
731		name == TGSI_SEMANTIC_PSIZE ||
732		name == TGSI_SEMANTIC_FACE)
733		index = 0;
734	else {
735		if (name == TGSI_SEMANTIC_GENERIC) {
736			/* For generic params simply use sid from tgsi */
737			index = io->sid;
738		} else {
739			/* For non-generic params - pack name and sid into 8 bits */
740			index = 0x80 | (name<<3) | (io->sid);
741		}
742
743		/* Make sure that all really used indices have nonzero value, so
744		 * we can just compare it to 0 later instead of comparing the name
745		 * with different values to detect special cases. */
746		index++;
747	}
748
749	return index;
750};
751
752/* turn input into interpolate on EG */
753static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
754{
755	int r = 0;
756
757	if (ctx->shader->input[index].spi_sid) {
758		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
759		if (ctx->shader->input[index].interpolate > 0) {
760			r = evergreen_interp_alu(ctx, index);
761		} else {
762			r = evergreen_interp_flat(ctx, index);
763		}
764	}
765	return r;
766}
767
768static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
769{
770	struct r600_bytecode_alu alu;
771	int i, r;
772	int gpr_front = ctx->shader->input[front].gpr;
773	int gpr_back = ctx->shader->input[back].gpr;
774
775	for (i = 0; i < 4; i++) {
776		memset(&alu, 0, sizeof(alu));
777		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
778		alu.is_op3 = 1;
779		alu.dst.write = 1;
780		alu.dst.sel = gpr_front;
781		alu.src[0].sel = ctx->face_gpr;
782		alu.src[1].sel = gpr_front;
783		alu.src[2].sel = gpr_back;
784
785		alu.dst.chan = i;
786		alu.src[1].chan = i;
787		alu.src[2].chan = i;
788		alu.last = (i==3);
789
790		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
791			return r;
792	}
793
794	return 0;
795}
796
797static int tgsi_declaration(struct r600_shader_ctx *ctx)
798{
799	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
800	unsigned i;
801	int r;
802
803	switch (d->Declaration.File) {
804	case TGSI_FILE_INPUT:
805		i = ctx->shader->ninput++;
806		ctx->shader->input[i].name = d->Semantic.Name;
807		ctx->shader->input[i].sid = d->Semantic.Index;
808		ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
809		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
810		ctx->shader->input[i].centroid = d->Interp.Centroid;
811		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
812		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
813			switch (ctx->shader->input[i].name) {
814			case TGSI_SEMANTIC_FACE:
815				ctx->face_gpr = ctx->shader->input[i].gpr;
816				break;
817			case TGSI_SEMANTIC_COLOR:
818				ctx->colors_used++;
819				break;
820			case TGSI_SEMANTIC_POSITION:
821				ctx->fragcoord_input = i;
822				break;
823			}
824			if (ctx->bc->chip_class >= EVERGREEN) {
825				if ((r = evergreen_interp_input(ctx, i)))
826					return r;
827			}
828		}
829		break;
830	case TGSI_FILE_OUTPUT:
831		i = ctx->shader->noutput++;
832		ctx->shader->output[i].name = d->Semantic.Name;
833		ctx->shader->output[i].sid = d->Semantic.Index;
834		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
835		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
836		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
837		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
838		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
839			switch (d->Semantic.Name) {
840			case TGSI_SEMANTIC_CLIPDIST:
841				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
842				break;
843			case TGSI_SEMANTIC_PSIZE:
844				ctx->shader->vs_out_misc_write = 1;
845				ctx->shader->vs_out_point_size = 1;
846				break;
847			case TGSI_SEMANTIC_CLIPVERTEX:
848				ctx->clip_vertex_write = TRUE;
849				ctx->cv_output = i;
850				break;
851			}
852		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
853			switch (d->Semantic.Name) {
854			case TGSI_SEMANTIC_COLOR:
855				ctx->shader->nr_ps_max_color_exports++;
856				break;
857			}
858		}
859		break;
860	case TGSI_FILE_CONSTANT:
861	case TGSI_FILE_TEMPORARY:
862	case TGSI_FILE_SAMPLER:
863	case TGSI_FILE_ADDRESS:
864		break;
865
866	case TGSI_FILE_SYSTEM_VALUE:
867		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
868			if (!ctx->native_integers) {
869				struct r600_bytecode_alu alu;
870				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
871
872				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
873				alu.src[0].sel = 0;
874				alu.src[0].chan = 3;
875
876				alu.dst.sel = 0;
877				alu.dst.chan = 3;
878				alu.dst.write = 1;
879				alu.last = 1;
880
881				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
882					return r;
883			}
884			break;
885		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
886			break;
887	default:
888		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
889		return -EINVAL;
890	}
891	return 0;
892}
893
894static int r600_get_temp(struct r600_shader_ctx *ctx)
895{
896	return ctx->temp_reg + ctx->max_driver_temp_used++;
897}
898
899/*
900 * for evergreen we need to scan the shader to find the number of GPRs we need to
901 * reserve for interpolation.
902 *
903 * we need to know if we are going to emit
904 * any centroid inputs
905 * if perspective and linear are required
906*/
907static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
908{
909	int i;
910	int num_baryc;
911
912	ctx->input_linear = FALSE;
913	ctx->input_perspective = FALSE;
914	ctx->input_centroid = FALSE;
915	ctx->num_interp_gpr = 1;
916
917	/* any centroid inputs */
918	for (i = 0; i < ctx->info.num_inputs; i++) {
919		/* skip position/face */
920		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
921		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
922			continue;
923		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
924			ctx->input_linear = TRUE;
925		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
926			ctx->input_perspective = TRUE;
927		if (ctx->info.input_centroid[i])
928			ctx->input_centroid = TRUE;
929	}
930
931	num_baryc = 0;
932	/* ignoring sample for now */
933	if (ctx->input_perspective)
934		num_baryc++;
935	if (ctx->input_linear)
936		num_baryc++;
937	if (ctx->input_centroid)
938		num_baryc *= 2;
939
940	ctx->num_interp_gpr += (num_baryc + 1) >> 1;
941
942	/* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */
943	return ctx->num_interp_gpr;
944}
945
946static void tgsi_src(struct r600_shader_ctx *ctx,
947		     const struct tgsi_full_src_register *tgsi_src,
948		     struct r600_shader_src *r600_src)
949{
950	memset(r600_src, 0, sizeof(*r600_src));
951	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
952	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
953	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
954	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
955	r600_src->neg = tgsi_src->Register.Negate;
956	r600_src->abs = tgsi_src->Register.Absolute;
957
958	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
959		int index;
960		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
961			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
962			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
963
964			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
965			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
966			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
967				return;
968		}
969		index = tgsi_src->Register.Index;
970		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
971		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
972	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
973		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
974			r600_src->swizzle[0] = 3;
975			r600_src->swizzle[1] = 3;
976			r600_src->swizzle[2] = 3;
977			r600_src->swizzle[3] = 3;
978			r600_src->sel = 0;
979		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
980			r600_src->swizzle[0] = 0;
981			r600_src->swizzle[1] = 0;
982			r600_src->swizzle[2] = 0;
983			r600_src->swizzle[3] = 0;
984			r600_src->sel = 0;
985		}
986	} else {
987		if (tgsi_src->Register.Indirect)
988			r600_src->rel = V_SQ_REL_RELATIVE;
989		r600_src->sel = tgsi_src->Register.Index;
990		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
991	}
992}
993
994static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int offset, unsigned int dst_reg)
995{
996	struct r600_bytecode_vtx vtx;
997	unsigned int ar_reg;
998	int r;
999
1000	if (offset) {
1001		struct r600_bytecode_alu alu;
1002
1003		memset(&alu, 0, sizeof(alu));
1004
1005		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
1006		alu.src[0].sel = ctx->bc->ar_reg;
1007
1008		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1009		alu.src[1].value = offset;
1010
1011		alu.dst.sel = dst_reg;
1012		alu.dst.write = 1;
1013		alu.last = 1;
1014
1015		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1016			return r;
1017
1018		ar_reg = dst_reg;
1019	} else {
1020		ar_reg = ctx->bc->ar_reg;
1021	}
1022
1023	memset(&vtx, 0, sizeof(vtx));
1024	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
1025	vtx.src_gpr = ar_reg;
1026	vtx.mega_fetch_count = 16;
1027	vtx.dst_gpr = dst_reg;
1028	vtx.dst_sel_x = 0;		/* SEL_X */
1029	vtx.dst_sel_y = 1;		/* SEL_Y */
1030	vtx.dst_sel_z = 2;		/* SEL_Z */
1031	vtx.dst_sel_w = 3;		/* SEL_W */
1032	vtx.data_format = FMT_32_32_32_32_FLOAT;
1033	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1034	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1035	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
1036	vtx.endian = r600_endian_swap(32);
1037
1038	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1039		return r;
1040
1041	return 0;
1042}
1043
1044static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1045{
1046	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1047	struct r600_bytecode_alu alu;
1048	int i, j, k, nconst, r;
1049
1050	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1051		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1052			nconst++;
1053		}
1054		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1055	}
1056	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1057		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1058			continue;
1059		}
1060
1061		if (ctx->src[i].rel) {
1062			int treg = r600_get_temp(ctx);
1063			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].sel - 512, treg)))
1064				return r;
1065
1066			ctx->src[i].sel = treg;
1067			ctx->src[i].rel = 0;
1068			j--;
1069		} else if (j > 0) {
1070			int treg = r600_get_temp(ctx);
1071			for (k = 0; k < 4; k++) {
1072				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1073				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1074				alu.src[0].sel = ctx->src[i].sel;
1075				alu.src[0].chan = k;
1076				alu.src[0].rel = ctx->src[i].rel;
1077				alu.dst.sel = treg;
1078				alu.dst.chan = k;
1079				alu.dst.write = 1;
1080				if (k == 3)
1081					alu.last = 1;
1082				r = r600_bytecode_add_alu(ctx->bc, &alu);
1083				if (r)
1084					return r;
1085			}
1086			ctx->src[i].sel = treg;
1087			ctx->src[i].rel =0;
1088			j--;
1089		}
1090	}
1091	return 0;
1092}
1093
1094/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1095static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1096{
1097	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1098	struct r600_bytecode_alu alu;
1099	int i, j, k, nliteral, r;
1100
1101	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1102		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1103			nliteral++;
1104		}
1105	}
1106	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1107		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1108			int treg = r600_get_temp(ctx);
1109			for (k = 0; k < 4; k++) {
1110				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1111				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1112				alu.src[0].sel = ctx->src[i].sel;
1113				alu.src[0].chan = k;
1114				alu.src[0].value = ctx->src[i].value[k];
1115				alu.dst.sel = treg;
1116				alu.dst.chan = k;
1117				alu.dst.write = 1;
1118				if (k == 3)
1119					alu.last = 1;
1120				r = r600_bytecode_add_alu(ctx->bc, &alu);
1121				if (r)
1122					return r;
1123			}
1124			ctx->src[i].sel = treg;
1125			j--;
1126		}
1127	}
1128	return 0;
1129}
1130
1131static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1132{
1133	int i, r, count = ctx->shader->ninput;
1134
1135	/* additional inputs will be allocated right after the existing inputs,
1136	 * we won't need them after the color selection, so we don't need to
1137	 * reserve these gprs for the rest of the shader code and to adjust
1138	 * output offsets etc. */
1139	int gpr = ctx->file_offset[TGSI_FILE_INPUT] +
1140			ctx->info.file_max[TGSI_FILE_INPUT] + 1;
1141
1142	if (ctx->face_gpr == -1) {
1143		i = ctx->shader->ninput++;
1144		ctx->shader->input[i].name = TGSI_SEMANTIC_FACE;
1145		ctx->shader->input[i].spi_sid = 0;
1146		ctx->shader->input[i].gpr = gpr++;
1147		ctx->face_gpr = ctx->shader->input[i].gpr;
1148	}
1149
1150	for (i = 0; i < count; i++) {
1151		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1152			int ni = ctx->shader->ninput++;
1153			memcpy(&ctx->shader->input[ni],&ctx->shader->input[i], sizeof(struct r600_shader_io));
1154			ctx->shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
1155			ctx->shader->input[ni].spi_sid = r600_spi_sid(&ctx->shader->input[ni]);
1156			ctx->shader->input[ni].gpr = gpr++;
1157
1158			if (ctx->bc->chip_class >= EVERGREEN) {
1159				r = evergreen_interp_input(ctx, ni);
1160				if (r)
1161					return r;
1162			}
1163
1164			r = select_twoside_color(ctx, i, ni);
1165			if (r)
1166				return r;
1167		}
1168	}
1169	return 0;
1170}
1171
1172static int r600_shader_from_tgsi(struct r600_screen *rscreen,
1173				 struct r600_pipe_shader *pipeshader,
1174				 struct r600_shader_key key)
1175{
1176	struct r600_shader *shader = &pipeshader->shader;
1177	struct tgsi_token *tokens = pipeshader->selector->tokens;
1178	struct pipe_stream_output_info so = pipeshader->selector->so;
1179	struct tgsi_full_immediate *immediate;
1180	struct tgsi_full_property *property;
1181	struct r600_shader_ctx ctx;
1182	struct r600_bytecode_output output[32];
1183	unsigned output_done, noutput;
1184	unsigned opcode;
1185	int i, j, k, r = 0;
1186	int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0;
1187	/* Declarations used by llvm code */
1188	bool use_llvm = false;
1189	unsigned char * inst_bytes = NULL;
1190	unsigned inst_byte_count = 0;
1191
1192#ifdef R600_USE_LLVM
1193	use_llvm = debug_get_bool_option("R600_LLVM", TRUE);
1194#endif
1195	ctx.bc = &shader->bc;
1196	ctx.shader = shader;
1197	ctx.native_integers = true;
1198
1199	r600_bytecode_init(ctx.bc, rscreen->chip_class, rscreen->family);
1200	ctx.tokens = tokens;
1201	tgsi_scan_shader(tokens, &ctx.info);
1202	tgsi_parse_init(&ctx.parse, tokens);
1203	ctx.type = ctx.parse.FullHeader.Processor.Processor;
1204	shader->processor_type = ctx.type;
1205	ctx.bc->type = shader->processor_type;
1206
1207	ctx.face_gpr = -1;
1208	ctx.fragcoord_input = -1;
1209	ctx.colors_used = 0;
1210	ctx.clip_vertex_write = 0;
1211
1212	shader->nr_ps_color_exports = 0;
1213	shader->nr_ps_max_color_exports = 0;
1214
1215	shader->two_side = key.color_two_side;
1216
1217	/* register allocations */
1218	/* Values [0,127] correspond to GPR[0..127].
1219	 * Values [128,159] correspond to constant buffer bank 0
1220	 * Values [160,191] correspond to constant buffer bank 1
1221	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1222	 * Values [256,287] correspond to constant buffer bank 2 (EG)
1223	 * Values [288,319] correspond to constant buffer bank 3 (EG)
1224	 * Other special values are shown in the list below.
1225	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1226	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1227	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1228	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1229	 * 248	SQ_ALU_SRC_0: special constant 0.0.
1230	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
1231	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
1232	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1233	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
1234	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
1235	 * 254	SQ_ALU_SRC_PV: previous vector result.
1236	 * 255	SQ_ALU_SRC_PS: previous scalar result.
1237	 */
1238	for (i = 0; i < TGSI_FILE_COUNT; i++) {
1239		ctx.file_offset[i] = 0;
1240	}
1241	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1242		ctx.file_offset[TGSI_FILE_INPUT] = 1;
1243		if (ctx.bc->chip_class >= EVERGREEN) {
1244			r600_bytecode_add_cfinst(ctx.bc, EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1245		} else {
1246			r600_bytecode_add_cfinst(ctx.bc, V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1247		}
1248	}
1249	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
1250		ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1251	}
1252
1253#ifdef R600_USE_LLVM
1254	if (use_llvm && ctx.info.indirect_files) {
1255		fprintf(stderr, "Warning: R600 LLVM backend does not support "
1256				"indirect adressing.  Falling back to TGSI "
1257				"backend.\n");
1258		use_llvm = 0;
1259	}
1260#endif
1261
1262	if (use_llvm) {
1263		ctx.file_offset[TGSI_FILE_OUTPUT] =
1264			ctx.file_offset[TGSI_FILE_INPUT];
1265	} else {
1266	   ctx.file_offset[TGSI_FILE_OUTPUT] =
1267			ctx.file_offset[TGSI_FILE_INPUT] +
1268			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1269	}
1270	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
1271						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
1272
1273	/* Outside the GPR range. This will be translated to one of the
1274	 * kcache banks later. */
1275	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
1276
1277	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
1278	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
1279			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
1280	ctx.temp_reg = ctx.bc->ar_reg + 1;
1281
1282	ctx.nliterals = 0;
1283	ctx.literals = NULL;
1284	shader->fs_write_all = FALSE;
1285	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1286		tgsi_parse_token(&ctx.parse);
1287		switch (ctx.parse.FullToken.Token.Type) {
1288		case TGSI_TOKEN_TYPE_IMMEDIATE:
1289			immediate = &ctx.parse.FullToken.FullImmediate;
1290			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
1291			if(ctx.literals == NULL) {
1292				r = -ENOMEM;
1293				goto out_err;
1294			}
1295			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
1296			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
1297			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
1298			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
1299			ctx.nliterals++;
1300			break;
1301		case TGSI_TOKEN_TYPE_DECLARATION:
1302			r = tgsi_declaration(&ctx);
1303			if (r)
1304				goto out_err;
1305			break;
1306		case TGSI_TOKEN_TYPE_INSTRUCTION:
1307			break;
1308		case TGSI_TOKEN_TYPE_PROPERTY:
1309			property = &ctx.parse.FullToken.FullProperty;
1310			switch (property->Property.PropertyName) {
1311			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
1312				if (property->u[0].Data == 1)
1313					shader->fs_write_all = TRUE;
1314				break;
1315			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
1316				/* we don't need this one */
1317				break;
1318			}
1319			break;
1320		default:
1321			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
1322			r = -EINVAL;
1323			goto out_err;
1324		}
1325	}
1326
1327/* LLVM backend setup */
1328#ifdef R600_USE_LLVM
1329	if (use_llvm) {
1330		struct radeon_llvm_context radeon_llvm_ctx;
1331		LLVMModuleRef mod;
1332		unsigned dump = 0;
1333		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
1334		radeon_llvm_ctx.reserved_reg_count = ctx.file_offset[TGSI_FILE_INPUT];
1335		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
1336		if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
1337			dump = 1;
1338		}
1339		if (r600_llvm_compile(mod, &inst_bytes, &inst_byte_count,
1340							rscreen->family, dump)) {
1341			FREE(inst_bytes);
1342			radeon_llvm_dispose(&radeon_llvm_ctx);
1343			use_llvm = 0;
1344			fprintf(stderr, "R600 LLVM backend failed to compile "
1345				"shader.  Falling back to TGSI\n");
1346		} else {
1347			ctx.file_offset[TGSI_FILE_OUTPUT] =
1348					ctx.file_offset[TGSI_FILE_INPUT];
1349		}
1350		radeon_llvm_dispose(&radeon_llvm_ctx);
1351	}
1352#endif
1353/* End of LLVM backend setup */
1354
1355	if (shader->fs_write_all && rscreen->chip_class >= EVERGREEN)
1356		shader->nr_ps_max_color_exports = 8;
1357
1358	if (ctx.fragcoord_input >= 0) {
1359		if (ctx.bc->chip_class == CAYMAN) {
1360			for (j = 0 ; j < 4; j++) {
1361				struct r600_bytecode_alu alu;
1362				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1363				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1364				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1365				alu.src[0].chan = 3;
1366
1367				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1368				alu.dst.chan = j;
1369				alu.dst.write = (j == 3);
1370				alu.last = 1;
1371				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1372					return r;
1373			}
1374		} else {
1375			struct r600_bytecode_alu alu;
1376			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1377			alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1378			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1379			alu.src[0].chan = 3;
1380
1381			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1382			alu.dst.chan = 3;
1383			alu.dst.write = 1;
1384			alu.last = 1;
1385			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1386				return r;
1387		}
1388	}
1389
1390	if (shader->two_side && ctx.colors_used) {
1391		if ((r = process_twoside_color_inputs(&ctx)))
1392			return r;
1393	}
1394
1395	tgsi_parse_init(&ctx.parse, tokens);
1396	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1397		tgsi_parse_token(&ctx.parse);
1398		switch (ctx.parse.FullToken.Token.Type) {
1399		case TGSI_TOKEN_TYPE_INSTRUCTION:
1400			if (use_llvm) {
1401				continue;
1402			}
1403			r = tgsi_is_supported(&ctx);
1404			if (r)
1405				goto out_err;
1406			ctx.max_driver_temp_used = 0;
1407			/* reserve first tmp for everyone */
1408			r600_get_temp(&ctx);
1409
1410			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
1411			if ((r = tgsi_split_constant(&ctx)))
1412				goto out_err;
1413			if ((r = tgsi_split_literal_constant(&ctx)))
1414				goto out_err;
1415			if (ctx.bc->chip_class == CAYMAN)
1416				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
1417			else if (ctx.bc->chip_class >= EVERGREEN)
1418				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
1419			else
1420				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
1421			r = ctx.inst_info->process(&ctx);
1422			if (r)
1423				goto out_err;
1424			break;
1425		default:
1426			break;
1427		}
1428	}
1429
1430	/* Get instructions if we are using the LLVM backend. */
1431	if (use_llvm) {
1432		r600_bytecode_from_byte_stream(&ctx, inst_bytes, inst_byte_count);
1433		FREE(inst_bytes);
1434	}
1435
1436	noutput = shader->noutput;
1437
1438	if (ctx.clip_vertex_write) {
1439		/* need to convert a clipvertex write into clipdistance writes and not export
1440		   the clip vertex anymore */
1441
1442		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
1443		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1444		shader->output[noutput].gpr = ctx.temp_reg;
1445		noutput++;
1446		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1447		shader->output[noutput].gpr = ctx.temp_reg+1;
1448		noutput++;
1449
1450		/* reset spi_sid for clipvertex output to avoid confusing spi */
1451		shader->output[ctx.cv_output].spi_sid = 0;
1452
1453		shader->clip_dist_write = 0xFF;
1454
1455		for (i = 0; i < 8; i++) {
1456			int oreg = i >> 2;
1457			int ochan = i & 3;
1458
1459			for (j = 0; j < 4; j++) {
1460				struct r600_bytecode_alu alu;
1461				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1462				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4);
1463				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
1464				alu.src[0].chan = j;
1465
1466				alu.src[1].sel = 512 + i;
1467				alu.src[1].kc_bank = 1;
1468				alu.src[1].chan = j;
1469
1470				alu.dst.sel = ctx.temp_reg + oreg;
1471				alu.dst.chan = j;
1472				alu.dst.write = (j == ochan);
1473				if (j == 3)
1474					alu.last = 1;
1475				r = r600_bytecode_add_alu(ctx.bc, &alu);
1476				if (r)
1477					return r;
1478			}
1479		}
1480	}
1481
1482	/* Add stream outputs. */
1483	if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
1484		for (i = 0; i < so.num_outputs; i++) {
1485			struct r600_bytecode_output output;
1486
1487			if (so.output[i].output_buffer >= 4) {
1488				R600_ERR("exceeded the max number of stream output buffers, got: %d\n",
1489					 so.output[i].output_buffer);
1490				r = -EINVAL;
1491				goto out_err;
1492			}
1493			if (so.output[i].dst_offset < so.output[i].start_component) {
1494			   R600_ERR("stream_output - dst_offset cannot be less than start_component\n");
1495			   r = -EINVAL;
1496			   goto out_err;
1497			}
1498
1499			memset(&output, 0, sizeof(struct r600_bytecode_output));
1500			output.gpr = shader->output[so.output[i].register_index].gpr;
1501			output.elem_size = 0;
1502			output.array_base = so.output[i].dst_offset - so.output[i].start_component;
1503			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1504			output.burst_count = 1;
1505			output.barrier = 1;
1506			/* array_size is an upper limit for the burst_count
1507			 * with MEM_STREAM instructions */
1508			output.array_size = 0xFFF;
1509			output.comp_mask = ((1 << so.output[i].num_components) - 1) << so.output[i].start_component;
1510			if (ctx.bc->chip_class >= EVERGREEN) {
1511				switch (so.output[i].output_buffer) {
1512				case 0:
1513					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0;
1514					break;
1515				case 1:
1516					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1;
1517					break;
1518				case 2:
1519					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2;
1520					break;
1521				case 3:
1522					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3;
1523					break;
1524				}
1525			} else {
1526				switch (so.output[i].output_buffer) {
1527				case 0:
1528					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0;
1529					break;
1530				case 1:
1531					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1;
1532					break;
1533				case 2:
1534					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2;
1535					break;
1536				case 3:
1537					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3;
1538					break;
1539				}
1540			}
1541			r = r600_bytecode_add_output(ctx.bc, &output);
1542			if (r)
1543				goto out_err;
1544		}
1545	}
1546
1547	/* export output */
1548	for (i = 0, j = 0; i < noutput; i++, j++) {
1549		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1550		output[j].gpr = shader->output[i].gpr;
1551		output[j].elem_size = 3;
1552		output[j].swizzle_x = 0;
1553		output[j].swizzle_y = 1;
1554		output[j].swizzle_z = 2;
1555		output[j].swizzle_w = 3;
1556		output[j].burst_count = 1;
1557		output[j].barrier = 1;
1558		output[j].type = -1;
1559		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1560		switch (ctx.type) {
1561		case TGSI_PROCESSOR_VERTEX:
1562			switch (shader->output[i].name) {
1563			case TGSI_SEMANTIC_POSITION:
1564				output[j].array_base = next_pos_base++;
1565				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1566				break;
1567
1568			case TGSI_SEMANTIC_PSIZE:
1569				output[j].array_base = next_pos_base++;
1570				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1571				break;
1572			case TGSI_SEMANTIC_CLIPVERTEX:
1573				j--;
1574				break;
1575			case TGSI_SEMANTIC_CLIPDIST:
1576				output[j].array_base = next_pos_base++;
1577				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1578				/* spi_sid is 0 for clipdistance outputs that were generated
1579				 * for clipvertex - we don't need to pass them to PS */
1580				if (shader->output[i].spi_sid) {
1581					j++;
1582					/* duplicate it as PARAM to pass to the pixel shader */
1583					memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
1584					output[j].array_base = next_param_base++;
1585					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1586				}
1587				break;
1588			case TGSI_SEMANTIC_FOG:
1589				output[j].swizzle_y = 4; /* 0 */
1590				output[j].swizzle_z = 4; /* 0 */
1591				output[j].swizzle_w = 5; /* 1 */
1592				break;
1593			}
1594			break;
1595		case TGSI_PROCESSOR_FRAGMENT:
1596			if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
1597				/* never export more colors than the number of CBs */
1598				if (next_pixel_base && next_pixel_base >= key.nr_cbufs + key.dual_src_blend) {
1599					/* skip export */
1600					j--;
1601					continue;
1602				}
1603				output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
1604				output[j].array_base = next_pixel_base++;
1605				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1606				shader->nr_ps_color_exports++;
1607				if (shader->fs_write_all && (rscreen->chip_class >= EVERGREEN)) {
1608					for (k = 1; k < key.nr_cbufs; k++) {
1609						j++;
1610						memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1611						output[j].gpr = shader->output[i].gpr;
1612						output[j].elem_size = 3;
1613						output[j].swizzle_x = 0;
1614						output[j].swizzle_y = 1;
1615						output[j].swizzle_z = 2;
1616						output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
1617						output[j].burst_count = 1;
1618						output[j].barrier = 1;
1619						output[j].array_base = next_pixel_base++;
1620						output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1621						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1622						shader->nr_ps_color_exports++;
1623					}
1624				}
1625			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
1626				output[j].array_base = 61;
1627				output[j].swizzle_x = 2;
1628				output[j].swizzle_y = 7;
1629				output[j].swizzle_z = output[j].swizzle_w = 7;
1630				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1631			} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
1632				output[j].array_base = 61;
1633				output[j].swizzle_x = 7;
1634				output[j].swizzle_y = 1;
1635				output[j].swizzle_z = output[j].swizzle_w = 7;
1636				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1637			} else {
1638				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
1639				r = -EINVAL;
1640				goto out_err;
1641			}
1642			break;
1643		default:
1644			R600_ERR("unsupported processor type %d\n", ctx.type);
1645			r = -EINVAL;
1646			goto out_err;
1647		}
1648
1649		if (output[j].type==-1) {
1650			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1651			output[j].array_base = next_param_base++;
1652		}
1653	}
1654
1655	/* add fake param output for vertex shader if no param is exported */
1656	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
1657			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1658			output[j].gpr = 0;
1659			output[j].elem_size = 3;
1660			output[j].swizzle_x = 7;
1661			output[j].swizzle_y = 7;
1662			output[j].swizzle_z = 7;
1663			output[j].swizzle_w = 7;
1664			output[j].burst_count = 1;
1665			output[j].barrier = 1;
1666			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1667			output[j].array_base = 0;
1668			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1669			j++;
1670	}
1671
1672	/* add fake pixel export */
1673	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && next_pixel_base == 0) {
1674		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1675		output[j].gpr = 0;
1676		output[j].elem_size = 3;
1677		output[j].swizzle_x = 7;
1678		output[j].swizzle_y = 7;
1679		output[j].swizzle_z = 7;
1680		output[j].swizzle_w = 7;
1681		output[j].burst_count = 1;
1682		output[j].barrier = 1;
1683		output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1684		output[j].array_base = 0;
1685		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1686		j++;
1687	}
1688
1689	noutput = j;
1690
1691	/* set export done on last export of each type */
1692	for (i = noutput - 1, output_done = 0; i >= 0; i--) {
1693		if (ctx.bc->chip_class < CAYMAN) {
1694			if (i == (noutput - 1)) {
1695				output[i].end_of_program = 1;
1696			}
1697		}
1698		if (!(output_done & (1 << output[i].type))) {
1699			output_done |= (1 << output[i].type);
1700			output[i].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE);
1701		}
1702	}
1703	/* add output to bytecode */
1704	for (i = 0; i < noutput; i++) {
1705		r = r600_bytecode_add_output(ctx.bc, &output[i]);
1706		if (r)
1707			goto out_err;
1708	}
1709	/* add program end */
1710	if (ctx.bc->chip_class == CAYMAN)
1711		cm_bytecode_add_cf_end(ctx.bc);
1712
1713	/* check GPR limit - we have 124 = 128 - 4
1714	 * (4 are reserved as alu clause temporary registers) */
1715	if (ctx.bc->ngpr > 124) {
1716		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
1717		r = -ENOMEM;
1718		goto out_err;
1719	}
1720
1721	free(ctx.literals);
1722	tgsi_parse_free(&ctx.parse);
1723	return 0;
1724out_err:
1725	free(ctx.literals);
1726	tgsi_parse_free(&ctx.parse);
1727	return r;
1728}
1729
1730static int tgsi_unsupported(struct r600_shader_ctx *ctx)
1731{
1732	R600_ERR("%s tgsi opcode unsupported\n",
1733		 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
1734	return -EINVAL;
1735}
1736
/*
 * Handler that emits nothing and always reports success.
 */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	(void)ctx;	/* intentionally unused */
	return 0;
}
1741
1742static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
1743			const struct r600_shader_src *shader_src,
1744			unsigned chan)
1745{
1746	bc_src->sel = shader_src->sel;
1747	bc_src->chan = shader_src->swizzle[chan];
1748	bc_src->neg = shader_src->neg;
1749	bc_src->abs = shader_src->abs;
1750	bc_src->rel = shader_src->rel;
1751	bc_src->value = shader_src->value[bc_src->chan];
1752}
1753
1754static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
1755{
1756	bc_src->abs = 1;
1757	bc_src->neg = 0;
1758}
1759
1760static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
1761{
1762	bc_src->neg = !bc_src->neg;
1763}
1764
1765static void tgsi_dst(struct r600_shader_ctx *ctx,
1766		     const struct tgsi_full_dst_register *tgsi_dst,
1767		     unsigned swizzle,
1768		     struct r600_bytecode_alu_dst *r600_dst)
1769{
1770	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1771
1772	r600_dst->sel = tgsi_dst->Register.Index;
1773	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
1774	r600_dst->chan = swizzle;
1775	r600_dst->write = 1;
1776	if (tgsi_dst->Register.Indirect)
1777		r600_dst->rel = V_SQ_REL_RELATIVE;
1778	if (inst->Instruction.Saturate) {
1779		r600_dst->clamp = 1;
1780	}
1781}
1782
/*
 * Return the index (0..3) of the highest channel enabled in the low four
 * bits of writemask, or 0 when none of them is set.  Higher bits are
 * ignored.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* scan from W down to Y; X (or an empty mask) is the fallback */
	for (chan = 3; chan > 0; chan--) {
		if ((writemask >> chan) & 1u)
			return chan;
	}
	return 0;
}
1794
1795static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
1796{
1797	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1798	struct r600_bytecode_alu alu;
1799	int i, j, r;
1800	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1801
1802	for (i = 0; i < lasti + 1; i++) {
1803		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1804			continue;
1805
1806		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1807		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1808
1809		alu.inst = ctx->inst_info->r600_opcode;
1810		if (!swap) {
1811			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1812				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
1813			}
1814		} else {
1815			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
1816			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1817		}
1818		/* handle some special cases */
1819		switch (ctx->inst_info->tgsi_opcode) {
1820		case TGSI_OPCODE_SUB:
1821			r600_bytecode_src_toggle_neg(&alu.src[1]);
1822			break;
1823		case TGSI_OPCODE_ABS:
1824			r600_bytecode_src_set_abs(&alu.src[0]);
1825			break;
1826		default:
1827			break;
1828		}
1829		if (i == lasti || trans_only) {
1830			alu.last = 1;
1831		}
1832		r = r600_bytecode_add_alu(ctx->bc, &alu);
1833		if (r)
1834			return r;
1835	}
1836	return 0;
1837}
1838
/*
 * Per-component opcode with sources in TGSI order, all channels in one
 * instruction group.
 */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1843
/*
 * Per-component opcode with the first two sources exchanged.
 */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	const int swap = 1;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1848
/*
 * Per-component opcode where every channel's instruction is marked "last",
 * i.e. each one forms its own instruction group.
 */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 1;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1853
1854static int tgsi_ineg(struct r600_shader_ctx *ctx)
1855{
1856	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1857	struct r600_bytecode_alu alu;
1858	int i, r;
1859	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1860
1861	for (i = 0; i < lasti + 1; i++) {
1862
1863		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1864			continue;
1865		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1866		alu.inst = ctx->inst_info->r600_opcode;
1867
1868		alu.src[0].sel = V_SQ_ALU_SRC_0;
1869
1870		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1871
1872		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1873
1874		if (i == lasti) {
1875			alu.last = 1;
1876		}
1877		r = r600_bytecode_add_alu(ctx->bc, &alu);
1878		if (r)
1879			return r;
1880	}
1881	return 0;
1882
1883}
1884
1885static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
1886{
1887	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1888	int i, j, r;
1889	struct r600_bytecode_alu alu;
1890	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1891
1892	for (i = 0 ; i < last_slot; i++) {
1893		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1894		alu.inst = ctx->inst_info->r600_opcode;
1895		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1896			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
1897
1898			/* RSQ should take the absolute value of src */
1899			if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_RSQ) {
1900				r600_bytecode_src_set_abs(&alu.src[j]);
1901			}
1902		}
1903		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1904		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
1905
1906		if (i == last_slot - 1)
1907			alu.last = 1;
1908		r = r600_bytecode_add_alu(ctx->bc, &alu);
1909		if (r)
1910			return r;
1911	}
1912	return 0;
1913}
1914
1915static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
1916{
1917	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1918	int i, j, k, r;
1919	struct r600_bytecode_alu alu;
1920	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1921	for (k = 0; k < last_slot; k++) {
1922		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
1923			continue;
1924
1925		for (i = 0 ; i < 4; i++) {
1926			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1927			alu.inst = ctx->inst_info->r600_opcode;
1928			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1929				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
1930			}
1931			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1932			alu.dst.write = (i == k);
1933			if (i == 3)
1934				alu.last = 1;
1935			r = r600_bytecode_add_alu(ctx->bc, &alu);
1936			if (r)
1937				return r;
1938		}
1939	}
1940	return 0;
1941}
1942
1943/*
1944 * r600 - trunc to -PI..PI range
1945 * r700 - normalize by dividing by 2PI
1946 * see fdo bug 27901
1947 */
1948static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
1949{
1950	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
1951	static float double_pi = 3.1415926535 * 2;
1952	static float neg_pi = -3.1415926535;
1953
1954	int r;
1955	struct r600_bytecode_alu alu;
1956
1957	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1958	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
1959	alu.is_op3 = 1;
1960
1961	alu.dst.chan = 0;
1962	alu.dst.sel = ctx->temp_reg;
1963	alu.dst.write = 1;
1964
1965	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
1966
1967	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1968	alu.src[1].chan = 0;
1969	alu.src[1].value = *(uint32_t *)&half_inv_pi;
1970	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
1971	alu.src[2].chan = 0;
1972	alu.last = 1;
1973	r = r600_bytecode_add_alu(ctx->bc, &alu);
1974	if (r)
1975		return r;
1976
1977	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1978	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
1979
1980	alu.dst.chan = 0;
1981	alu.dst.sel = ctx->temp_reg;
1982	alu.dst.write = 1;
1983
1984	alu.src[0].sel = ctx->temp_reg;
1985	alu.src[0].chan = 0;
1986	alu.last = 1;
1987	r = r600_bytecode_add_alu(ctx->bc, &alu);
1988	if (r)
1989		return r;
1990
1991	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1992	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
1993	alu.is_op3 = 1;
1994
1995	alu.dst.chan = 0;
1996	alu.dst.sel = ctx->temp_reg;
1997	alu.dst.write = 1;
1998
1999	alu.src[0].sel = ctx->temp_reg;
2000	alu.src[0].chan = 0;
2001
2002	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2003	alu.src[1].chan = 0;
2004	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
2005	alu.src[2].chan = 0;
2006
2007	if (ctx->bc->chip_class == R600) {
2008		alu.src[1].value = *(uint32_t *)&double_pi;
2009		alu.src[2].value = *(uint32_t *)&neg_pi;
2010	} else {
2011		alu.src[1].sel = V_SQ_ALU_SRC_1;
2012		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2013		alu.src[2].neg = 1;
2014	}
2015
2016	alu.last = 1;
2017	r = r600_bytecode_add_alu(ctx->bc, &alu);
2018	if (r)
2019		return r;
2020	return 0;
2021}
2022
2023static int cayman_trig(struct r600_shader_ctx *ctx)
2024{
2025	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2026	struct r600_bytecode_alu alu;
2027	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2028	int i, r;
2029
2030	r = tgsi_setup_trig(ctx);
2031	if (r)
2032		return r;
2033
2034
2035	for (i = 0; i < last_slot; i++) {
2036		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2037		alu.inst = ctx->inst_info->r600_opcode;
2038		alu.dst.chan = i;
2039
2040		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2041		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2042
2043		alu.src[0].sel = ctx->temp_reg;
2044		alu.src[0].chan = 0;
2045		if (i == last_slot - 1)
2046			alu.last = 1;
2047		r = r600_bytecode_add_alu(ctx->bc, &alu);
2048		if (r)
2049			return r;
2050	}
2051	return 0;
2052}
2053
2054static int tgsi_trig(struct r600_shader_ctx *ctx)
2055{
2056	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2057	struct r600_bytecode_alu alu;
2058	int i, r;
2059	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2060
2061	r = tgsi_setup_trig(ctx);
2062	if (r)
2063		return r;
2064
2065	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2066	alu.inst = ctx->inst_info->r600_opcode;
2067	alu.dst.chan = 0;
2068	alu.dst.sel = ctx->temp_reg;
2069	alu.dst.write = 1;
2070
2071	alu.src[0].sel = ctx->temp_reg;
2072	alu.src[0].chan = 0;
2073	alu.last = 1;
2074	r = r600_bytecode_add_alu(ctx->bc, &alu);
2075	if (r)
2076		return r;
2077
2078	/* replicate result */
2079	for (i = 0; i < lasti + 1; i++) {
2080		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2081			continue;
2082
2083		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2084		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2085
2086		alu.src[0].sel = ctx->temp_reg;
2087		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2088		if (i == lasti)
2089			alu.last = 1;
2090		r = r600_bytecode_add_alu(ctx->bc, &alu);
2091		if (r)
2092			return r;
2093	}
2094	return 0;
2095}
2096
2097static int tgsi_scs(struct r600_shader_ctx *ctx)
2098{
2099	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2100	struct r600_bytecode_alu alu;
2101	int i, r;
2102
2103	/* We'll only need the trig stuff if we are going to write to the
2104	 * X or Y components of the destination vector.
2105	 */
2106	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
2107		r = tgsi_setup_trig(ctx);
2108		if (r)
2109			return r;
2110	}
2111
2112	/* dst.x = COS */
2113	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2114		if (ctx->bc->chip_class == CAYMAN) {
2115			for (i = 0 ; i < 3; i++) {
2116				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2117				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2118				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2119
2120				if (i == 0)
2121					alu.dst.write = 1;
2122				else
2123					alu.dst.write = 0;
2124				alu.src[0].sel = ctx->temp_reg;
2125				alu.src[0].chan = 0;
2126				if (i == 2)
2127					alu.last = 1;
2128				r = r600_bytecode_add_alu(ctx->bc, &alu);
2129				if (r)
2130					return r;
2131			}
2132		} else {
2133			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2134			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2135			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2136
2137			alu.src[0].sel = ctx->temp_reg;
2138			alu.src[0].chan = 0;
2139			alu.last = 1;
2140			r = r600_bytecode_add_alu(ctx->bc, &alu);
2141			if (r)
2142				return r;
2143		}
2144	}
2145
2146	/* dst.y = SIN */
2147	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2148		if (ctx->bc->chip_class == CAYMAN) {
2149			for (i = 0 ; i < 3; i++) {
2150				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2151				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2152				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2153				if (i == 1)
2154					alu.dst.write = 1;
2155				else
2156					alu.dst.write = 0;
2157				alu.src[0].sel = ctx->temp_reg;
2158				alu.src[0].chan = 0;
2159				if (i == 2)
2160					alu.last = 1;
2161				r = r600_bytecode_add_alu(ctx->bc, &alu);
2162				if (r)
2163					return r;
2164			}
2165		} else {
2166			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2167			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2168			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2169
2170			alu.src[0].sel = ctx->temp_reg;
2171			alu.src[0].chan = 0;
2172			alu.last = 1;
2173			r = r600_bytecode_add_alu(ctx->bc, &alu);
2174			if (r)
2175				return r;
2176		}
2177	}
2178
2179	/* dst.z = 0.0; */
2180	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2181		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2182
2183		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2184
2185		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2186
2187		alu.src[0].sel = V_SQ_ALU_SRC_0;
2188		alu.src[0].chan = 0;
2189
2190		alu.last = 1;
2191
2192		r = r600_bytecode_add_alu(ctx->bc, &alu);
2193		if (r)
2194			return r;
2195	}
2196
2197	/* dst.w = 1.0; */
2198	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2199		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2200
2201		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2202
2203		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2204
2205		alu.src[0].sel = V_SQ_ALU_SRC_1;
2206		alu.src[0].chan = 0;
2207
2208		alu.last = 1;
2209
2210		r = r600_bytecode_add_alu(ctx->bc, &alu);
2211		if (r)
2212			return r;
2213	}
2214
2215	return 0;
2216}
2217
2218static int tgsi_kill(struct r600_shader_ctx *ctx)
2219{
2220	struct r600_bytecode_alu alu;
2221	int i, r;
2222
2223	for (i = 0; i < 4; i++) {
2224		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2225		alu.inst = ctx->inst_info->r600_opcode;
2226
2227		alu.dst.chan = i;
2228
2229		alu.src[0].sel = V_SQ_ALU_SRC_0;
2230
2231		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) {
2232			alu.src[1].sel = V_SQ_ALU_SRC_1;
2233			alu.src[1].neg = 1;
2234		} else {
2235			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2236		}
2237		if (i == 3) {
2238			alu.last = 1;
2239		}
2240		r = r600_bytecode_add_alu(ctx->bc, &alu);
2241		if (r)
2242			return r;
2243	}
2244
2245	/* kill must be last in ALU */
2246	ctx->bc->force_add_cf = 1;
2247	ctx->shader->uses_kill = TRUE;
2248	return 0;
2249}
2250
2251static int tgsi_lit(struct r600_shader_ctx *ctx)
2252{
2253	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2254	struct r600_bytecode_alu alu;
2255	int r;
2256
2257	/* tmp.x = max(src.y, 0.0) */
2258	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2259	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2260	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
2261	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2262	alu.src[1].chan = 1;
2263
2264	alu.dst.sel = ctx->temp_reg;
2265	alu.dst.chan = 0;
2266	alu.dst.write = 1;
2267
2268	alu.last = 1;
2269	r = r600_bytecode_add_alu(ctx->bc, &alu);
2270	if (r)
2271		return r;
2272
2273	if (inst->Dst[0].Register.WriteMask & (1 << 2))
2274	{
2275		int chan;
2276		int sel;
2277		int i;
2278
2279		if (ctx->bc->chip_class == CAYMAN) {
2280			for (i = 0; i < 3; i++) {
2281				/* tmp.z = log(tmp.x) */
2282				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2283				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2284				alu.src[0].sel = ctx->temp_reg;
2285				alu.src[0].chan = 0;
2286				alu.dst.sel = ctx->temp_reg;
2287				alu.dst.chan = i;
2288				if (i == 2) {
2289					alu.dst.write = 1;
2290					alu.last = 1;
2291				} else
2292					alu.dst.write = 0;
2293
2294				r = r600_bytecode_add_alu(ctx->bc, &alu);
2295				if (r)
2296					return r;
2297			}
2298		} else {
2299			/* tmp.z = log(tmp.x) */
2300			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2301			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2302			alu.src[0].sel = ctx->temp_reg;
2303			alu.src[0].chan = 0;
2304			alu.dst.sel = ctx->temp_reg;
2305			alu.dst.chan = 2;
2306			alu.dst.write = 1;
2307			alu.last = 1;
2308			r = r600_bytecode_add_alu(ctx->bc, &alu);
2309			if (r)
2310				return r;
2311		}
2312
2313		chan = alu.dst.chan;
2314		sel = alu.dst.sel;
2315
2316		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
2317		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2318		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT);
2319		alu.src[0].sel  = sel;
2320		alu.src[0].chan = chan;
2321		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
2322		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
2323		alu.dst.sel = ctx->temp_reg;
2324		alu.dst.chan = 0;
2325		alu.dst.write = 1;
2326		alu.is_op3 = 1;
2327		alu.last = 1;
2328		r = r600_bytecode_add_alu(ctx->bc, &alu);
2329		if (r)
2330			return r;
2331
2332		if (ctx->bc->chip_class == CAYMAN) {
2333			for (i = 0; i < 3; i++) {
2334				/* dst.z = exp(tmp.x) */
2335				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2336				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2337				alu.src[0].sel = ctx->temp_reg;
2338				alu.src[0].chan = 0;
2339				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2340				if (i == 2) {
2341					alu.dst.write = 1;
2342					alu.last = 1;
2343				} else
2344					alu.dst.write = 0;
2345				r = r600_bytecode_add_alu(ctx->bc, &alu);
2346				if (r)
2347					return r;
2348			}
2349		} else {
2350			/* dst.z = exp(tmp.x) */
2351			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2352			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2353			alu.src[0].sel = ctx->temp_reg;
2354			alu.src[0].chan = 0;
2355			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2356			alu.last = 1;
2357			r = r600_bytecode_add_alu(ctx->bc, &alu);
2358			if (r)
2359				return r;
2360		}
2361	}
2362
2363	/* dst.x, <- 1.0  */
2364	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2365	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2366	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
2367	alu.src[0].chan = 0;
2368	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2369	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
2370	r = r600_bytecode_add_alu(ctx->bc, &alu);
2371	if (r)
2372		return r;
2373
2374	/* dst.y = max(src.x, 0.0) */
2375	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2376	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2377	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2378	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2379	alu.src[1].chan = 0;
2380	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2381	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
2382	r = r600_bytecode_add_alu(ctx->bc, &alu);
2383	if (r)
2384		return r;
2385
2386	/* dst.w, <- 1.0  */
2387	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2388	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2389	alu.src[0].sel  = V_SQ_ALU_SRC_1;
2390	alu.src[0].chan = 0;
2391	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2392	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
2393	alu.last = 1;
2394	r = r600_bytecode_add_alu(ctx->bc, &alu);
2395	if (r)
2396		return r;
2397
2398	return 0;
2399}
2400
2401static int tgsi_rsq(struct r600_shader_ctx *ctx)
2402{
2403	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2404	struct r600_bytecode_alu alu;
2405	int i, r;
2406
2407	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2408
2409	/* XXX:
2410	 * For state trackers other than OpenGL, we'll want to use
2411	 * _RECIPSQRT_IEEE instead.
2412	 */
2413	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED);
2414
2415	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2416		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2417		r600_bytecode_src_set_abs(&alu.src[i]);
2418	}
2419	alu.dst.sel = ctx->temp_reg;
2420	alu.dst.write = 1;
2421	alu.last = 1;
2422	r = r600_bytecode_add_alu(ctx->bc, &alu);
2423	if (r)
2424		return r;
2425	/* replicate result */
2426	return tgsi_helper_tempx_replicate(ctx);
2427}
2428
2429static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
2430{
2431	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2432	struct r600_bytecode_alu alu;
2433	int i, r;
2434
2435	for (i = 0; i < 4; i++) {
2436		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2437		alu.src[0].sel = ctx->temp_reg;
2438		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2439		alu.dst.chan = i;
2440		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2441		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2442		if (i == 3)
2443			alu.last = 1;
2444		r = r600_bytecode_add_alu(ctx->bc, &alu);
2445		if (r)
2446			return r;
2447	}
2448	return 0;
2449}
2450
2451static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
2452{
2453	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2454	struct r600_bytecode_alu alu;
2455	int i, r;
2456
2457	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2458	alu.inst = ctx->inst_info->r600_opcode;
2459	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2460		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2461	}
2462	alu.dst.sel = ctx->temp_reg;
2463	alu.dst.write = 1;
2464	alu.last = 1;
2465	r = r600_bytecode_add_alu(ctx->bc, &alu);
2466	if (r)
2467		return r;
2468	/* replicate result */
2469	return tgsi_helper_tempx_replicate(ctx);
2470}
2471
2472static int cayman_pow(struct r600_shader_ctx *ctx)
2473{
2474	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2475	int i, r;
2476	struct r600_bytecode_alu alu;
2477	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2478
2479	for (i = 0; i < 3; i++) {
2480		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2481		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2482		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2483		alu.dst.sel = ctx->temp_reg;
2484		alu.dst.chan = i;
2485		alu.dst.write = 1;
2486		if (i == 2)
2487			alu.last = 1;
2488		r = r600_bytecode_add_alu(ctx->bc, &alu);
2489		if (r)
2490			return r;
2491	}
2492
2493	/* b * LOG2(a) */
2494	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2495	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2496	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2497	alu.src[1].sel = ctx->temp_reg;
2498	alu.dst.sel = ctx->temp_reg;
2499	alu.dst.write = 1;
2500	alu.last = 1;
2501	r = r600_bytecode_add_alu(ctx->bc, &alu);
2502	if (r)
2503		return r;
2504
2505	for (i = 0; i < last_slot; i++) {
2506		/* POW(a,b) = EXP2(b * LOG2(a))*/
2507		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2508		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2509		alu.src[0].sel = ctx->temp_reg;
2510
2511		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2512		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2513		if (i == last_slot - 1)
2514			alu.last = 1;
2515		r = r600_bytecode_add_alu(ctx->bc, &alu);
2516		if (r)
2517			return r;
2518	}
2519	return 0;
2520}
2521
2522static int tgsi_pow(struct r600_shader_ctx *ctx)
2523{
2524	struct r600_bytecode_alu alu;
2525	int r;
2526
2527	/* LOG2(a) */
2528	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2529	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2530	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2531	alu.dst.sel = ctx->temp_reg;
2532	alu.dst.write = 1;
2533	alu.last = 1;
2534	r = r600_bytecode_add_alu(ctx->bc, &alu);
2535	if (r)
2536		return r;
2537	/* b * LOG2(a) */
2538	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2539	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2540	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2541	alu.src[1].sel = ctx->temp_reg;
2542	alu.dst.sel = ctx->temp_reg;
2543	alu.dst.write = 1;
2544	alu.last = 1;
2545	r = r600_bytecode_add_alu(ctx->bc, &alu);
2546	if (r)
2547		return r;
2548	/* POW(a,b) = EXP2(b * LOG2(a))*/
2549	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2550	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2551	alu.src[0].sel = ctx->temp_reg;
2552	alu.dst.sel = ctx->temp_reg;
2553	alu.dst.write = 1;
2554	alu.last = 1;
2555	r = r600_bytecode_add_alu(ctx->bc, &alu);
2556	if (r)
2557		return r;
2558	return tgsi_helper_tempx_replicate(ctx);
2559}
2560
2561static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
2562{
2563	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2564	struct r600_bytecode_alu alu;
2565	int i, r, j;
2566	unsigned write_mask = inst->Dst[0].Register.WriteMask;
2567	int tmp0 = ctx->temp_reg;
2568	int tmp1 = r600_get_temp(ctx);
2569	int tmp2 = r600_get_temp(ctx);
2570	int tmp3 = r600_get_temp(ctx);
2571	/* Unsigned path:
2572	 *
2573	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
2574	 *
2575	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
2576	 * 2. tmp0.z = lo (tmp0.x * src2)
2577	 * 3. tmp0.w = -tmp0.z
2578	 * 4. tmp0.y = hi (tmp0.x * src2)
2579	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
2580	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
2581	 * 7. tmp1.x = tmp0.x - tmp0.w
2582	 * 8. tmp1.y = tmp0.x + tmp0.w
2583	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
2584	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
2585	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
2586	 *
2587	 * 12. tmp0.w = src1 - tmp0.y       = r
2588	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
2589	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
2590	 *
2591	 * if DIV
2592	 *
2593	 *   15. tmp1.z = tmp0.z + 1			= q + 1
2594	 *   16. tmp1.w = tmp0.z - 1			= q - 1
2595	 *
2596	 * else MOD
2597	 *
2598	 *   15. tmp1.z = tmp0.w - src2			= r - src2
2599	 *   16. tmp1.w = tmp0.w + src2			= r + src2
2600	 *
2601	 * endif
2602	 *
2603	 * 17. tmp1.x = tmp1.x & tmp1.y
2604	 *
2605	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
2606	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
2607	 *
2608	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
2609	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
2610	 *
2611	 * Signed path:
2612	 *
2613	 * Same as unsigned, using abs values of the operands,
2614	 * and fixing the sign of the result in the end.
2615	 */
2616
2617	for (i = 0; i < 4; i++) {
2618		if (!(write_mask & (1<<i)))
2619			continue;
2620
2621		if (signed_op) {
2622
2623			/* tmp2.x = -src0 */
2624			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2625			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2626
2627			alu.dst.sel = tmp2;
2628			alu.dst.chan = 0;
2629			alu.dst.write = 1;
2630
2631			alu.src[0].sel = V_SQ_ALU_SRC_0;
2632
2633			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2634
2635			alu.last = 1;
2636			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2637				return r;
2638
2639			/* tmp2.y = -src1 */
2640			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2641			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2642
2643			alu.dst.sel = tmp2;
2644			alu.dst.chan = 1;
2645			alu.dst.write = 1;
2646
2647			alu.src[0].sel = V_SQ_ALU_SRC_0;
2648
2649			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2650
2651			alu.last = 1;
2652			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2653				return r;
2654
2655			/* tmp2.z sign bit is set if src0 and src2 signs are different */
2656			/* it will be a sign of the quotient */
2657			if (!mod) {
2658
2659				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2660				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT);
2661
2662				alu.dst.sel = tmp2;
2663				alu.dst.chan = 2;
2664				alu.dst.write = 1;
2665
2666				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2667				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2668
2669				alu.last = 1;
2670				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2671					return r;
2672			}
2673
2674			/* tmp2.x = |src0| */
2675			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2676			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2677			alu.is_op3 = 1;
2678
2679			alu.dst.sel = tmp2;
2680			alu.dst.chan = 0;
2681			alu.dst.write = 1;
2682
2683			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2684			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2685			alu.src[2].sel = tmp2;
2686			alu.src[2].chan = 0;
2687
2688			alu.last = 1;
2689			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2690				return r;
2691
2692			/* tmp2.y = |src1| */
2693			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2694			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2695			alu.is_op3 = 1;
2696
2697			alu.dst.sel = tmp2;
2698			alu.dst.chan = 1;
2699			alu.dst.write = 1;
2700
2701			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2702			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2703			alu.src[2].sel = tmp2;
2704			alu.src[2].chan = 1;
2705
2706			alu.last = 1;
2707			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2708				return r;
2709
2710		}
2711
2712		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
2713		if (ctx->bc->chip_class == CAYMAN) {
2714			/* tmp3.x = u2f(src2) */
2715			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2716			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
2717
2718			alu.dst.sel = tmp3;
2719			alu.dst.chan = 0;
2720			alu.dst.write = 1;
2721
2722			if (signed_op) {
2723				alu.src[0].sel = tmp2;
2724				alu.src[0].chan = 1;
2725			} else {
2726				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2727			}
2728
2729			alu.last = 1;
2730			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2731				return r;
2732
2733			/* tmp0.x = recip(tmp3.x) */
2734			for (j = 0 ; j < 3; j++) {
2735				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2736				alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE;
2737
2738				alu.dst.sel = tmp0;
2739				alu.dst.chan = j;
2740				alu.dst.write = (j == 0);
2741
2742				alu.src[0].sel = tmp3;
2743				alu.src[0].chan = 0;
2744
2745				if (j == 2)
2746					alu.last = 1;
2747				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2748					return r;
2749			}
2750
2751			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2752			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2753
2754			alu.src[0].sel = tmp0;
2755			alu.src[0].chan = 0;
2756
2757			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2758			alu.src[1].value = 0x4f800000;
2759
2760			alu.dst.sel = tmp3;
2761			alu.dst.write = 1;
2762			alu.last = 1;
2763			r = r600_bytecode_add_alu(ctx->bc, &alu);
2764			if (r)
2765				return r;
2766
2767			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2768			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
2769
2770			alu.dst.sel = tmp0;
2771			alu.dst.chan = 0;
2772			alu.dst.write = 1;
2773
2774			alu.src[0].sel = tmp3;
2775			alu.src[0].chan = 0;
2776
2777			alu.last = 1;
2778			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2779				return r;
2780
2781		} else {
2782			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2783			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT);
2784
2785			alu.dst.sel = tmp0;
2786			alu.dst.chan = 0;
2787			alu.dst.write = 1;
2788
2789			if (signed_op) {
2790				alu.src[0].sel = tmp2;
2791				alu.src[0].chan = 1;
2792			} else {
2793				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2794			}
2795
2796			alu.last = 1;
2797			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2798				return r;
2799		}
2800
2801		/* 2. tmp0.z = lo (tmp0.x * src2) */
2802		if (ctx->bc->chip_class == CAYMAN) {
2803			for (j = 0 ; j < 4; j++) {
2804				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2805				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2806
2807				alu.dst.sel = tmp0;
2808				alu.dst.chan = j;
2809				alu.dst.write = (j == 2);
2810
2811				alu.src[0].sel = tmp0;
2812				alu.src[0].chan = 0;
2813				if (signed_op) {
2814					alu.src[1].sel = tmp2;
2815					alu.src[1].chan = 1;
2816				} else {
2817					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2818				}
2819
2820				alu.last = (j == 3);
2821				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2822					return r;
2823			}
2824		} else {
2825			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2826			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2827
2828			alu.dst.sel = tmp0;
2829			alu.dst.chan = 2;
2830			alu.dst.write = 1;
2831
2832			alu.src[0].sel = tmp0;
2833			alu.src[0].chan = 0;
2834			if (signed_op) {
2835				alu.src[1].sel = tmp2;
2836				alu.src[1].chan = 1;
2837			} else {
2838				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2839			}
2840
2841			alu.last = 1;
2842			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2843				return r;
2844		}
2845
2846		/* 3. tmp0.w = -tmp0.z */
2847		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2848		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2849
2850		alu.dst.sel = tmp0;
2851		alu.dst.chan = 3;
2852		alu.dst.write = 1;
2853
2854		alu.src[0].sel = V_SQ_ALU_SRC_0;
2855		alu.src[1].sel = tmp0;
2856		alu.src[1].chan = 2;
2857
2858		alu.last = 1;
2859		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2860			return r;
2861
2862		/* 4. tmp0.y = hi (tmp0.x * src2) */
2863		if (ctx->bc->chip_class == CAYMAN) {
2864			for (j = 0 ; j < 4; j++) {
2865				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2866				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2867
2868				alu.dst.sel = tmp0;
2869				alu.dst.chan = j;
2870				alu.dst.write = (j == 1);
2871
2872				alu.src[0].sel = tmp0;
2873				alu.src[0].chan = 0;
2874
2875				if (signed_op) {
2876					alu.src[1].sel = tmp2;
2877					alu.src[1].chan = 1;
2878				} else {
2879					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2880				}
2881				alu.last = (j == 3);
2882				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2883					return r;
2884			}
2885		} else {
2886			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2887			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2888
2889			alu.dst.sel = tmp0;
2890			alu.dst.chan = 1;
2891			alu.dst.write = 1;
2892
2893			alu.src[0].sel = tmp0;
2894			alu.src[0].chan = 0;
2895
2896			if (signed_op) {
2897				alu.src[1].sel = tmp2;
2898				alu.src[1].chan = 1;
2899			} else {
2900				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2901			}
2902
2903			alu.last = 1;
2904			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2905				return r;
2906		}
2907
2908		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
2909		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2910		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
2911		alu.is_op3 = 1;
2912
2913		alu.dst.sel = tmp0;
2914		alu.dst.chan = 2;
2915		alu.dst.write = 1;
2916
2917		alu.src[0].sel = tmp0;
2918		alu.src[0].chan = 1;
2919		alu.src[1].sel = tmp0;
2920		alu.src[1].chan = 3;
2921		alu.src[2].sel = tmp0;
2922		alu.src[2].chan = 2;
2923
2924		alu.last = 1;
2925		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2926			return r;
2927
2928		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
2929		if (ctx->bc->chip_class == CAYMAN) {
2930			for (j = 0 ; j < 4; j++) {
2931				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2932				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2933
2934				alu.dst.sel = tmp0;
2935				alu.dst.chan = j;
2936				alu.dst.write = (j == 3);
2937
2938				alu.src[0].sel = tmp0;
2939				alu.src[0].chan = 2;
2940
2941				alu.src[1].sel = tmp0;
2942				alu.src[1].chan = 0;
2943
2944				alu.last = (j == 3);
2945				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2946					return r;
2947			}
2948		} else {
2949			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2950			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2951
2952			alu.dst.sel = tmp0;
2953			alu.dst.chan = 3;
2954			alu.dst.write = 1;
2955
2956			alu.src[0].sel = tmp0;
2957			alu.src[0].chan = 2;
2958
2959			alu.src[1].sel = tmp0;
2960			alu.src[1].chan = 0;
2961
2962			alu.last = 1;
2963			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2964				return r;
2965		}
2966
2967		/* 7. tmp1.x = tmp0.x - tmp0.w */
2968		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2969		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2970
2971		alu.dst.sel = tmp1;
2972		alu.dst.chan = 0;
2973		alu.dst.write = 1;
2974
2975		alu.src[0].sel = tmp0;
2976		alu.src[0].chan = 0;
2977		alu.src[1].sel = tmp0;
2978		alu.src[1].chan = 3;
2979
2980		alu.last = 1;
2981		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2982			return r;
2983
2984		/* 8. tmp1.y = tmp0.x + tmp0.w */
2985		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2986		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
2987
2988		alu.dst.sel = tmp1;
2989		alu.dst.chan = 1;
2990		alu.dst.write = 1;
2991
2992		alu.src[0].sel = tmp0;
2993		alu.src[0].chan = 0;
2994		alu.src[1].sel = tmp0;
2995		alu.src[1].chan = 3;
2996
2997		alu.last = 1;
2998		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2999			return r;
3000
3001		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
3002		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3003		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3004		alu.is_op3 = 1;
3005
3006		alu.dst.sel = tmp0;
3007		alu.dst.chan = 0;
3008		alu.dst.write = 1;
3009
3010		alu.src[0].sel = tmp0;
3011		alu.src[0].chan = 1;
3012		alu.src[1].sel = tmp1;
3013		alu.src[1].chan = 1;
3014		alu.src[2].sel = tmp1;
3015		alu.src[2].chan = 0;
3016
3017		alu.last = 1;
3018		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3019			return r;
3020
3021		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
3022		if (ctx->bc->chip_class == CAYMAN) {
3023			for (j = 0 ; j < 4; j++) {
3024				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3025				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3026
3027				alu.dst.sel = tmp0;
3028				alu.dst.chan = j;
3029				alu.dst.write = (j == 2);
3030
3031				alu.src[0].sel = tmp0;
3032				alu.src[0].chan = 0;
3033
3034				if (signed_op) {
3035					alu.src[1].sel = tmp2;
3036					alu.src[1].chan = 0;
3037				} else {
3038					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3039				}
3040
3041				alu.last = (j == 3);
3042				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3043					return r;
3044			}
3045		} else {
3046			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3047			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3048
3049			alu.dst.sel = tmp0;
3050			alu.dst.chan = 2;
3051			alu.dst.write = 1;
3052
3053			alu.src[0].sel = tmp0;
3054			alu.src[0].chan = 0;
3055
3056			if (signed_op) {
3057				alu.src[1].sel = tmp2;
3058				alu.src[1].chan = 0;
3059			} else {
3060				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3061			}
3062
3063			alu.last = 1;
3064			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3065				return r;
3066		}
3067
3068		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
3069		if (ctx->bc->chip_class == CAYMAN) {
3070			for (j = 0 ; j < 4; j++) {
3071				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3072				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3073
3074				alu.dst.sel = tmp0;
3075				alu.dst.chan = j;
3076				alu.dst.write = (j == 1);
3077
3078				if (signed_op) {
3079					alu.src[0].sel = tmp2;
3080					alu.src[0].chan = 1;
3081				} else {
3082					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3083				}
3084
3085				alu.src[1].sel = tmp0;
3086				alu.src[1].chan = 2;
3087
3088				alu.last = (j == 3);
3089				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3090					return r;
3091			}
3092		} else {
3093			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3094			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3095
3096			alu.dst.sel = tmp0;
3097			alu.dst.chan = 1;
3098			alu.dst.write = 1;
3099
3100			if (signed_op) {
3101				alu.src[0].sel = tmp2;
3102				alu.src[0].chan = 1;
3103			} else {
3104				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3105			}
3106
3107			alu.src[1].sel = tmp0;
3108			alu.src[1].chan = 2;
3109
3110			alu.last = 1;
3111			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3112				return r;
3113		}
3114
3115		/* 12. tmp0.w = src1 - tmp0.y       = r */
3116		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3117		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3118
3119		alu.dst.sel = tmp0;
3120		alu.dst.chan = 3;
3121		alu.dst.write = 1;
3122
3123		if (signed_op) {
3124			alu.src[0].sel = tmp2;
3125			alu.src[0].chan = 0;
3126		} else {
3127			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3128		}
3129
3130		alu.src[1].sel = tmp0;
3131		alu.src[1].chan = 1;
3132
3133		alu.last = 1;
3134		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3135			return r;
3136
3137		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
3138		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3139		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3140
3141		alu.dst.sel = tmp1;
3142		alu.dst.chan = 0;
3143		alu.dst.write = 1;
3144
3145		alu.src[0].sel = tmp0;
3146		alu.src[0].chan = 3;
3147		if (signed_op) {
3148			alu.src[1].sel = tmp2;
3149			alu.src[1].chan = 1;
3150		} else {
3151			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3152		}
3153
3154		alu.last = 1;
3155		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3156			return r;
3157
3158		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
3159		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3160		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3161
3162		alu.dst.sel = tmp1;
3163		alu.dst.chan = 1;
3164		alu.dst.write = 1;
3165
3166		if (signed_op) {
3167			alu.src[0].sel = tmp2;
3168			alu.src[0].chan = 0;
3169		} else {
3170			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3171		}
3172
3173		alu.src[1].sel = tmp0;
3174		alu.src[1].chan = 1;
3175
3176		alu.last = 1;
3177		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3178			return r;
3179
3180		if (mod) { /* UMOD */
3181
3182			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
3183			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3184			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3185
3186			alu.dst.sel = tmp1;
3187			alu.dst.chan = 2;
3188			alu.dst.write = 1;
3189
3190			alu.src[0].sel = tmp0;
3191			alu.src[0].chan = 3;
3192
3193			if (signed_op) {
3194				alu.src[1].sel = tmp2;
3195				alu.src[1].chan = 1;
3196			} else {
3197				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3198			}
3199
3200			alu.last = 1;
3201			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3202				return r;
3203
3204			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
3205			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3206			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3207
3208			alu.dst.sel = tmp1;
3209			alu.dst.chan = 3;
3210			alu.dst.write = 1;
3211
3212			alu.src[0].sel = tmp0;
3213			alu.src[0].chan = 3;
3214			if (signed_op) {
3215				alu.src[1].sel = tmp2;
3216				alu.src[1].chan = 1;
3217			} else {
3218				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3219			}
3220
3221			alu.last = 1;
3222			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3223				return r;
3224
3225		} else { /* UDIV */
3226
3227			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
3228			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3229			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3230
3231			alu.dst.sel = tmp1;
3232			alu.dst.chan = 2;
3233			alu.dst.write = 1;
3234
3235			alu.src[0].sel = tmp0;
3236			alu.src[0].chan = 2;
3237			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3238
3239			alu.last = 1;
3240			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3241				return r;
3242
3243			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
3244			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3245			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3246
3247			alu.dst.sel = tmp1;
3248			alu.dst.chan = 3;
3249			alu.dst.write = 1;
3250
3251			alu.src[0].sel = tmp0;
3252			alu.src[0].chan = 2;
3253			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
3254
3255			alu.last = 1;
3256			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3257				return r;
3258
3259		}
3260
3261		/* 17. tmp1.x = tmp1.x & tmp1.y */
3262		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3263		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
3264
3265		alu.dst.sel = tmp1;
3266		alu.dst.chan = 0;
3267		alu.dst.write = 1;
3268
3269		alu.src[0].sel = tmp1;
3270		alu.src[0].chan = 0;
3271		alu.src[1].sel = tmp1;
3272		alu.src[1].chan = 1;
3273
3274		alu.last = 1;
3275		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3276			return r;
3277
3278		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
3279		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
3280		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3281		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3282		alu.is_op3 = 1;
3283
3284		alu.dst.sel = tmp0;
3285		alu.dst.chan = 2;
3286		alu.dst.write = 1;
3287
3288		alu.src[0].sel = tmp1;
3289		alu.src[0].chan = 0;
3290		alu.src[1].sel = tmp0;
3291		alu.src[1].chan = mod ? 3 : 2;
3292		alu.src[2].sel = tmp1;
3293		alu.src[2].chan = 2;
3294
3295		alu.last = 1;
3296		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3297			return r;
3298
3299		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
3300		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3301		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3302		alu.is_op3 = 1;
3303
3304		if (signed_op) {
3305			alu.dst.sel = tmp0;
3306			alu.dst.chan = 2;
3307			alu.dst.write = 1;
3308		} else {
3309			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3310		}
3311
3312		alu.src[0].sel = tmp1;
3313		alu.src[0].chan = 1;
3314		alu.src[1].sel = tmp1;
3315		alu.src[1].chan = 3;
3316		alu.src[2].sel = tmp0;
3317		alu.src[2].chan = 2;
3318
3319		alu.last = 1;
3320		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3321			return r;
3322
3323		if (signed_op) {
3324
3325			/* fix the sign of the result */
3326
3327			if (mod) {
3328
3329				/* tmp0.x = -tmp0.z */
3330				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3331				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3332
3333				alu.dst.sel = tmp0;
3334				alu.dst.chan = 0;
3335				alu.dst.write = 1;
3336
3337				alu.src[0].sel = V_SQ_ALU_SRC_0;
3338				alu.src[1].sel = tmp0;
3339				alu.src[1].chan = 2;
3340
3341				alu.last = 1;
3342				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3343					return r;
3344
3345				/* sign of the remainder is the same as the sign of src0 */
3346				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
3347				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3348				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3349				alu.is_op3 = 1;
3350
3351				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3352
3353				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3354				alu.src[1].sel = tmp0;
3355				alu.src[1].chan = 2;
3356				alu.src[2].sel = tmp0;
3357				alu.src[2].chan = 0;
3358
3359				alu.last = 1;
3360				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3361					return r;
3362
3363			} else {
3364
3365				/* tmp0.x = -tmp0.z */
3366				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3367				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3368
3369				alu.dst.sel = tmp0;
3370				alu.dst.chan = 0;
3371				alu.dst.write = 1;
3372
3373				alu.src[0].sel = V_SQ_ALU_SRC_0;
3374				alu.src[1].sel = tmp0;
3375				alu.src[1].chan = 2;
3376
3377				alu.last = 1;
3378				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3379					return r;
3380
3381				/* fix the quotient sign (same as the sign of src0*src1) */
3382				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
3383				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3384				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3385				alu.is_op3 = 1;
3386
3387				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3388
3389				alu.src[0].sel = tmp2;
3390				alu.src[0].chan = 2;
3391				alu.src[1].sel = tmp0;
3392				alu.src[1].chan = 2;
3393				alu.src[2].sel = tmp0;
3394				alu.src[2].chan = 0;
3395
3396				alu.last = 1;
3397				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3398					return r;
3399			}
3400		}
3401	}
3402	return 0;
3403}
3404
/* TGSI UDIV: unsigned quotient, lowered by the shared tgsi_divmod() emitter. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int mod = 0;		/* produce the quotient, not the remainder */
	const int signed_op = 0;	/* operands are treated as unsigned */

	return tgsi_divmod(ctx, mod, signed_op);
}
3409
/* TGSI UMOD: unsigned remainder, lowered by the shared tgsi_divmod() emitter. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int mod = 1;		/* produce the remainder */
	const int signed_op = 0;	/* operands are treated as unsigned */

	return tgsi_divmod(ctx, mod, signed_op);
}
3414
/* TGSI IDIV: signed quotient, lowered by the shared tgsi_divmod() emitter. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int mod = 0;		/* produce the quotient, not the remainder */
	const int signed_op = 1;	/* take the signed path in tgsi_divmod() */

	return tgsi_divmod(ctx, mod, signed_op);
}
3419
/* TGSI IMOD: signed remainder, lowered by the shared tgsi_divmod() emitter. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int mod = 1;		/* produce the remainder */
	const int signed_op = 1;	/* take the signed path in tgsi_divmod() */

	return tgsi_divmod(ctx, mod, signed_op);
}
3424
3425
3426static int tgsi_f2i(struct r600_shader_ctx *ctx)
3427{
3428	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3429	struct r600_bytecode_alu alu;
3430	int i, r;
3431	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3432	int last_inst = tgsi_last_instruction(write_mask);
3433
3434	for (i = 0; i < 4; i++) {
3435		if (!(write_mask & (1<<i)))
3436			continue;
3437
3438		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3439		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC);
3440
3441		alu.dst.sel = ctx->temp_reg;
3442		alu.dst.chan = i;
3443		alu.dst.write = 1;
3444
3445		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3446		if (i == last_inst)
3447			alu.last = 1;
3448		r = r600_bytecode_add_alu(ctx->bc, &alu);
3449		if (r)
3450			return r;
3451	}
3452
3453	for (i = 0; i < 4; i++) {
3454		if (!(write_mask & (1<<i)))
3455			continue;
3456
3457		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3458		alu.inst = ctx->inst_info->r600_opcode;
3459
3460		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3461
3462		alu.src[0].sel = ctx->temp_reg;
3463		alu.src[0].chan = i;
3464
3465		if (i == last_inst || alu.inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT)
3466			alu.last = 1;
3467		r = r600_bytecode_add_alu(ctx->bc, &alu);
3468		if (r)
3469			return r;
3470	}
3471
3472	return 0;
3473}
3474
3475static int tgsi_iabs(struct r600_shader_ctx *ctx)
3476{
3477	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3478	struct r600_bytecode_alu alu;
3479	int i, r;
3480	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3481	int last_inst = tgsi_last_instruction(write_mask);
3482
3483	/* tmp = -src */
3484	for (i = 0; i < 4; i++) {
3485		if (!(write_mask & (1<<i)))
3486			continue;
3487
3488		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3489		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3490
3491		alu.dst.sel = ctx->temp_reg;
3492		alu.dst.chan = i;
3493		alu.dst.write = 1;
3494
3495		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3496		alu.src[0].sel = V_SQ_ALU_SRC_0;
3497
3498		if (i == last_inst)
3499			alu.last = 1;
3500		r = r600_bytecode_add_alu(ctx->bc, &alu);
3501		if (r)
3502			return r;
3503	}
3504
3505	/* dst = (src >= 0 ? src : tmp) */
3506	for (i = 0; i < 4; i++) {
3507		if (!(write_mask & (1<<i)))
3508			continue;
3509
3510		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3511		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3512		alu.is_op3 = 1;
3513		alu.dst.write = 1;
3514
3515		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3516
3517		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3518		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3519		alu.src[2].sel = ctx->temp_reg;
3520		alu.src[2].chan = i;
3521
3522		if (i == last_inst)
3523			alu.last = 1;
3524		r = r600_bytecode_add_alu(ctx->bc, &alu);
3525		if (r)
3526			return r;
3527	}
3528	return 0;
3529}
3530
3531static int tgsi_issg(struct r600_shader_ctx *ctx)
3532{
3533	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3534	struct r600_bytecode_alu alu;
3535	int i, r;
3536	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3537	int last_inst = tgsi_last_instruction(write_mask);
3538
3539	/* tmp = (src >= 0 ? src : -1) */
3540	for (i = 0; i < 4; i++) {
3541		if (!(write_mask & (1<<i)))
3542			continue;
3543
3544		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3545		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3546		alu.is_op3 = 1;
3547
3548		alu.dst.sel = ctx->temp_reg;
3549		alu.dst.chan = i;
3550		alu.dst.write = 1;
3551
3552		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3553		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3554		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
3555
3556		if (i == last_inst)
3557			alu.last = 1;
3558		r = r600_bytecode_add_alu(ctx->bc, &alu);
3559		if (r)
3560			return r;
3561	}
3562
3563	/* dst = (tmp > 0 ? 1 : tmp) */
3564	for (i = 0; i < 4; i++) {
3565		if (!(write_mask & (1<<i)))
3566			continue;
3567
3568		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3569		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT_INT);
3570		alu.is_op3 = 1;
3571		alu.dst.write = 1;
3572
3573		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3574
3575		alu.src[0].sel = ctx->temp_reg;
3576		alu.src[0].chan = i;
3577
3578		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3579
3580		alu.src[2].sel = ctx->temp_reg;
3581		alu.src[2].chan = i;
3582
3583		if (i == last_inst)
3584			alu.last = 1;
3585		r = r600_bytecode_add_alu(ctx->bc, &alu);
3586		if (r)
3587			return r;
3588	}
3589	return 0;
3590}
3591
3592
3593
3594static int tgsi_ssg(struct r600_shader_ctx *ctx)
3595{
3596	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3597	struct r600_bytecode_alu alu;
3598	int i, r;
3599
3600	/* tmp = (src > 0 ? 1 : src) */
3601	for (i = 0; i < 4; i++) {
3602		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3603		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3604		alu.is_op3 = 1;
3605
3606		alu.dst.sel = ctx->temp_reg;
3607		alu.dst.chan = i;
3608
3609		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3610		alu.src[1].sel = V_SQ_ALU_SRC_1;
3611		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
3612
3613		if (i == 3)
3614			alu.last = 1;
3615		r = r600_bytecode_add_alu(ctx->bc, &alu);
3616		if (r)
3617			return r;
3618	}
3619
3620	/* dst = (-tmp > 0 ? -1 : tmp) */
3621	for (i = 0; i < 4; i++) {
3622		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3623		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3624		alu.is_op3 = 1;
3625		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3626
3627		alu.src[0].sel = ctx->temp_reg;
3628		alu.src[0].chan = i;
3629		alu.src[0].neg = 1;
3630
3631		alu.src[1].sel = V_SQ_ALU_SRC_1;
3632		alu.src[1].neg = 1;
3633
3634		alu.src[2].sel = ctx->temp_reg;
3635		alu.src[2].chan = i;
3636
3637		if (i == 3)
3638			alu.last = 1;
3639		r = r600_bytecode_add_alu(ctx->bc, &alu);
3640		if (r)
3641			return r;
3642	}
3643	return 0;
3644}
3645
3646static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
3647{
3648	struct r600_bytecode_alu alu;
3649	int i, r;
3650
3651	for (i = 0; i < 4; i++) {
3652		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3653		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
3654			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
3655			alu.dst.chan = i;
3656		} else {
3657			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3658			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3659			alu.src[0].sel = ctx->temp_reg;
3660			alu.src[0].chan = i;
3661		}
3662		if (i == 3) {
3663			alu.last = 1;
3664		}
3665		r = r600_bytecode_add_alu(ctx->bc, &alu);
3666		if (r)
3667			return r;
3668	}
3669	return 0;
3670}
3671
3672static int tgsi_op3(struct r600_shader_ctx *ctx)
3673{
3674	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3675	struct r600_bytecode_alu alu;
3676	int i, j, r;
3677	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3678
3679	for (i = 0; i < lasti + 1; i++) {
3680		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3681			continue;
3682
3683		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3684		alu.inst = ctx->inst_info->r600_opcode;
3685		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3686			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3687		}
3688
3689		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3690		alu.dst.chan = i;
3691		alu.dst.write = 1;
3692		alu.is_op3 = 1;
3693		if (i == lasti) {
3694			alu.last = 1;
3695		}
3696		r = r600_bytecode_add_alu(ctx->bc, &alu);
3697		if (r)
3698			return r;
3699	}
3700	return 0;
3701}
3702
3703static int tgsi_dp(struct r600_shader_ctx *ctx)
3704{
3705	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3706	struct r600_bytecode_alu alu;
3707	int i, j, r;
3708
3709	for (i = 0; i < 4; i++) {
3710		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3711		alu.inst = ctx->inst_info->r600_opcode;
3712		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3713			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3714		}
3715
3716		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3717		alu.dst.chan = i;
3718		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3719		/* handle some special cases */
3720		switch (ctx->inst_info->tgsi_opcode) {
3721		case TGSI_OPCODE_DP2:
3722			if (i > 1) {
3723				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3724				alu.src[0].chan = alu.src[1].chan = 0;
3725			}
3726			break;
3727		case TGSI_OPCODE_DP3:
3728			if (i > 2) {
3729				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3730				alu.src[0].chan = alu.src[1].chan = 0;
3731			}
3732			break;
3733		case TGSI_OPCODE_DPH:
3734			if (i == 3) {
3735				alu.src[0].sel = V_SQ_ALU_SRC_1;
3736				alu.src[0].chan = 0;
3737				alu.src[0].neg = 0;
3738			}
3739			break;
3740		default:
3741			break;
3742		}
3743		if (i == 3) {
3744			alu.last = 1;
3745		}
3746		r = r600_bytecode_add_alu(ctx->bc, &alu);
3747		if (r)
3748			return r;
3749	}
3750	return 0;
3751}
3752
3753static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
3754						    unsigned index)
3755{
3756	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3757	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
3758		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
3759		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
3760		ctx->src[index].neg || ctx->src[index].abs;
3761}
3762
3763static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
3764					unsigned index)
3765{
3766	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3767	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
3768}
3769
3770static int tgsi_tex(struct r600_shader_ctx *ctx)
3771{
3772	static float one_point_five = 1.5f;
3773	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3774	struct r600_bytecode_tex tex;
3775	struct r600_bytecode_alu alu;
3776	unsigned src_gpr;
3777	int r, i, j;
3778	int opcode;
3779	/* Texture fetch instructions can only use gprs as source.
3780	 * Also they cannot negate the source or take the absolute value */
3781	const boolean src_requires_loading = inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
3782                                             tgsi_tex_src_requires_loading(ctx, 0);
3783	boolean src_loaded = FALSE;
3784	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
3785	uint8_t offset_x = 0, offset_y = 0, offset_z = 0;
3786
3787	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
3788
3789	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
3790		/* get offset values */
3791		if (inst->Texture.NumOffsets) {
3792			assert(inst->Texture.NumOffsets == 1);
3793
3794			offset_x = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
3795			offset_y = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
3796			offset_z = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
3797		}
3798	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
3799		/* TGSI moves the sampler to src reg 3 for TXD */
3800		sampler_src_reg = 3;
3801
3802		for (i = 1; i < 3; i++) {
3803			/* set gradients h/v */
3804			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
3805			tex.inst = (i == 1) ? SQ_TEX_INST_SET_GRADIENTS_H :
3806				SQ_TEX_INST_SET_GRADIENTS_V;
3807			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
3808			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
3809
3810			if (tgsi_tex_src_requires_loading(ctx, i)) {
3811				tex.src_gpr = r600_get_temp(ctx);
3812				tex.src_sel_x = 0;
3813				tex.src_sel_y = 1;
3814				tex.src_sel_z = 2;
3815				tex.src_sel_w = 3;
3816
3817				for (j = 0; j < 4; j++) {
3818					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3819					alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3820                                        r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
3821                                        alu.dst.sel = tex.src_gpr;
3822                                        alu.dst.chan = j;
3823                                        if (j == 3)
3824                                                alu.last = 1;
3825                                        alu.dst.write = 1;
3826                                        r = r600_bytecode_add_alu(ctx->bc, &alu);
3827                                        if (r)
3828                                                return r;
3829				}
3830
3831			} else {
3832				tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i);
3833				tex.src_sel_x = ctx->src[i].swizzle[0];
3834				tex.src_sel_y = ctx->src[i].swizzle[1];
3835				tex.src_sel_z = ctx->src[i].swizzle[2];
3836				tex.src_sel_w = ctx->src[i].swizzle[3];
3837				tex.src_rel = ctx->src[i].rel;
3838			}
3839			tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */
3840			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
3841			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
3842				tex.coord_type_x = 1;
3843				tex.coord_type_y = 1;
3844				tex.coord_type_z = 1;
3845				tex.coord_type_w = 1;
3846			}
3847			r = r600_bytecode_add_tex(ctx->bc, &tex);
3848			if (r)
3849				return r;
3850		}
3851	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
3852		int out_chan;
3853		/* Add perspective divide */
3854		if (ctx->bc->chip_class == CAYMAN) {
3855			out_chan = 2;
3856			for (i = 0; i < 3; i++) {
3857				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3858				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3859				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3860
3861				alu.dst.sel = ctx->temp_reg;
3862				alu.dst.chan = i;
3863				if (i == 2)
3864					alu.last = 1;
3865				if (out_chan == i)
3866					alu.dst.write = 1;
3867				r = r600_bytecode_add_alu(ctx->bc, &alu);
3868				if (r)
3869					return r;
3870			}
3871
3872		} else {
3873			out_chan = 3;
3874			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3875			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3876			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3877
3878			alu.dst.sel = ctx->temp_reg;
3879			alu.dst.chan = out_chan;
3880			alu.last = 1;
3881			alu.dst.write = 1;
3882			r = r600_bytecode_add_alu(ctx->bc, &alu);
3883			if (r)
3884				return r;
3885		}
3886
3887		for (i = 0; i < 3; i++) {
3888			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3889			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
3890			alu.src[0].sel = ctx->temp_reg;
3891			alu.src[0].chan = out_chan;
3892			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3893			alu.dst.sel = ctx->temp_reg;
3894			alu.dst.chan = i;
3895			alu.dst.write = 1;
3896			r = r600_bytecode_add_alu(ctx->bc, &alu);
3897			if (r)
3898				return r;
3899		}
3900		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3901		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3902		alu.src[0].sel = V_SQ_ALU_SRC_1;
3903		alu.src[0].chan = 0;
3904		alu.dst.sel = ctx->temp_reg;
3905		alu.dst.chan = 3;
3906		alu.last = 1;
3907		alu.dst.write = 1;
3908		r = r600_bytecode_add_alu(ctx->bc, &alu);
3909		if (r)
3910			return r;
3911		src_loaded = TRUE;
3912		src_gpr = ctx->temp_reg;
3913	}
3914
3915	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
3916	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) &&
3917	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
3918	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
3919
3920		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
3921		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
3922
3923		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
3924		for (i = 0; i < 4; i++) {
3925			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3926			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE);
3927			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
3928			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
3929			alu.dst.sel = ctx->temp_reg;
3930			alu.dst.chan = i;
3931			if (i == 3)
3932				alu.last = 1;
3933			alu.dst.write = 1;
3934			r = r600_bytecode_add_alu(ctx->bc, &alu);
3935			if (r)
3936				return r;
3937		}
3938
3939		/* tmp1.z = RCP_e(|tmp1.z|) */
3940		if (ctx->bc->chip_class == CAYMAN) {
3941			for (i = 0; i < 3; i++) {
3942				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3943				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3944				alu.src[0].sel = ctx->temp_reg;
3945				alu.src[0].chan = 2;
3946				alu.src[0].abs = 1;
3947				alu.dst.sel = ctx->temp_reg;
3948				alu.dst.chan = i;
3949				if (i == 2)
3950					alu.dst.write = 1;
3951				if (i == 2)
3952					alu.last = 1;
3953				r = r600_bytecode_add_alu(ctx->bc, &alu);
3954				if (r)
3955					return r;
3956			}
3957		} else {
3958			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3959			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3960			alu.src[0].sel = ctx->temp_reg;
3961			alu.src[0].chan = 2;
3962			alu.src[0].abs = 1;
3963			alu.dst.sel = ctx->temp_reg;
3964			alu.dst.chan = 2;
3965			alu.dst.write = 1;
3966			alu.last = 1;
3967			r = r600_bytecode_add_alu(ctx->bc, &alu);
3968			if (r)
3969				return r;
3970		}
3971
3972		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
3973		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
3974		 * muladd has no writemask, have to use another temp
3975		 */
3976		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3977		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
3978		alu.is_op3 = 1;
3979
3980		alu.src[0].sel = ctx->temp_reg;
3981		alu.src[0].chan = 0;
3982		alu.src[1].sel = ctx->temp_reg;
3983		alu.src[1].chan = 2;
3984
3985		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
3986		alu.src[2].chan = 0;
3987		alu.src[2].value = *(uint32_t *)&one_point_five;
3988
3989		alu.dst.sel = ctx->temp_reg;
3990		alu.dst.chan = 0;
3991		alu.dst.write = 1;
3992
3993		r = r600_bytecode_add_alu(ctx->bc, &alu);
3994		if (r)
3995			return r;
3996
3997		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3998		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
3999		alu.is_op3 = 1;
4000
4001		alu.src[0].sel = ctx->temp_reg;
4002		alu.src[0].chan = 1;
4003		alu.src[1].sel = ctx->temp_reg;
4004		alu.src[1].chan = 2;
4005
4006		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4007		alu.src[2].chan = 0;
4008		alu.src[2].value = *(uint32_t *)&one_point_five;
4009
4010		alu.dst.sel = ctx->temp_reg;
4011		alu.dst.chan = 1;
4012		alu.dst.write = 1;
4013
4014		alu.last = 1;
4015		r = r600_bytecode_add_alu(ctx->bc, &alu);
4016		if (r)
4017			return r;
4018		/* write initial W value into Z component */
4019		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
4020			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4021			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4022			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4023			alu.dst.sel = ctx->temp_reg;
4024			alu.dst.chan = 2;
4025			alu.dst.write = 1;
4026			alu.last = 1;
4027			r = r600_bytecode_add_alu(ctx->bc, &alu);
4028			if (r)
4029				return r;
4030		}
4031		src_loaded = TRUE;
4032		src_gpr = ctx->temp_reg;
4033	}
4034
4035	if (src_requires_loading && !src_loaded) {
4036		for (i = 0; i < 4; i++) {
4037			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4038			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4039			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4040			alu.dst.sel = ctx->temp_reg;
4041			alu.dst.chan = i;
4042			if (i == 3)
4043				alu.last = 1;
4044			alu.dst.write = 1;
4045			r = r600_bytecode_add_alu(ctx->bc, &alu);
4046			if (r)
4047				return r;
4048		}
4049		src_loaded = TRUE;
4050		src_gpr = ctx->temp_reg;
4051	}
4052
4053	opcode = ctx->inst_info->r600_opcode;
4054	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4055	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4056	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4057	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4058	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
4059	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
4060		switch (opcode) {
4061		case SQ_TEX_INST_SAMPLE:
4062			opcode = SQ_TEX_INST_SAMPLE_C;
4063			break;
4064		case SQ_TEX_INST_SAMPLE_L:
4065			opcode = SQ_TEX_INST_SAMPLE_C_L;
4066			break;
4067		case SQ_TEX_INST_SAMPLE_LB:
4068			opcode = SQ_TEX_INST_SAMPLE_C_LB;
4069			break;
4070		case SQ_TEX_INST_SAMPLE_G:
4071			opcode = SQ_TEX_INST_SAMPLE_C_G;
4072			break;
4073		}
4074	}
4075
4076	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4077	tex.inst = opcode;
4078
4079	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4080	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4081	tex.src_gpr = src_gpr;
4082	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4083	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
4084	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
4085	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
4086	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
4087
4088	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
4089		tex.src_sel_x = 4;
4090		tex.src_sel_y = 4;
4091		tex.src_sel_z = 4;
4092		tex.src_sel_w = 4;
4093	} else if (src_loaded) {
4094		tex.src_sel_x = 0;
4095		tex.src_sel_y = 1;
4096		tex.src_sel_z = 2;
4097		tex.src_sel_w = 3;
4098	} else {
4099		tex.src_sel_x = ctx->src[0].swizzle[0];
4100		tex.src_sel_y = ctx->src[0].swizzle[1];
4101		tex.src_sel_z = ctx->src[0].swizzle[2];
4102		tex.src_sel_w = ctx->src[0].swizzle[3];
4103		tex.src_rel = ctx->src[0].rel;
4104	}
4105
4106	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE) {
4107		tex.src_sel_x = 1;
4108		tex.src_sel_y = 0;
4109		tex.src_sel_z = 3;
4110		tex.src_sel_w = 1;
4111	}
4112	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
4113		tex.src_sel_x = 1;
4114		tex.src_sel_y = 0;
4115		tex.src_sel_z = 3;
4116		tex.src_sel_w = 2; /* route Z compare value into W */
4117	}
4118
4119	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
4120	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
4121		tex.coord_type_x = 1;
4122		tex.coord_type_y = 1;
4123	}
4124	tex.coord_type_z = 1;
4125	tex.coord_type_w = 1;
4126
4127	tex.offset_x = offset_x;
4128	tex.offset_y = offset_y;
4129	tex.offset_z = offset_z;
4130
4131	/* Put the depth for comparison in W.
4132	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
4133	 * Some instructions expect the depth in Z. */
4134	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4135	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4136	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4137	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
4138	    opcode != SQ_TEX_INST_SAMPLE_C_L &&
4139	    opcode != SQ_TEX_INST_SAMPLE_C_LB) {
4140		tex.src_sel_w = tex.src_sel_z;
4141	}
4142
4143	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
4144	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4145		if (opcode == SQ_TEX_INST_SAMPLE_C_L ||
4146		    opcode == SQ_TEX_INST_SAMPLE_C_LB) {
4147			/* the array index is read from Y */
4148			tex.coord_type_y = 0;
4149		} else {
4150			/* the array index is read from Z */
4151			tex.coord_type_z = 0;
4152			tex.src_sel_z = tex.src_sel_y;
4153		}
4154	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
4155		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
4156		/* the array index is read from Z */
4157		tex.coord_type_z = 0;
4158
4159	r = r600_bytecode_add_tex(ctx->bc, &tex);
4160	if (r)
4161		return r;
4162
4163	/* add shadow ambient support  - gallium doesn't do it yet */
4164	return 0;
4165}
4166
4167static int tgsi_lrp(struct r600_shader_ctx *ctx)
4168{
4169	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4170	struct r600_bytecode_alu alu;
4171	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4172	unsigned i;
4173	int r;
4174
4175	/* optimize if it's just an equal balance */
4176	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
4177		for (i = 0; i < lasti + 1; i++) {
4178			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4179				continue;
4180
4181			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4182			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4183			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4184			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4185			alu.omod = 3;
4186			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4187			alu.dst.chan = i;
4188			if (i == lasti) {
4189				alu.last = 1;
4190			}
4191			r = r600_bytecode_add_alu(ctx->bc, &alu);
4192			if (r)
4193				return r;
4194		}
4195		return 0;
4196	}
4197
4198	/* 1 - src0 */
4199	for (i = 0; i < lasti + 1; i++) {
4200		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4201			continue;
4202
4203		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4204		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4205		alu.src[0].sel = V_SQ_ALU_SRC_1;
4206		alu.src[0].chan = 0;
4207		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4208		r600_bytecode_src_toggle_neg(&alu.src[1]);
4209		alu.dst.sel = ctx->temp_reg;
4210		alu.dst.chan = i;
4211		if (i == lasti) {
4212			alu.last = 1;
4213		}
4214		alu.dst.write = 1;
4215		r = r600_bytecode_add_alu(ctx->bc, &alu);
4216		if (r)
4217			return r;
4218	}
4219
4220	/* (1 - src0) * src2 */
4221	for (i = 0; i < lasti + 1; i++) {
4222		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4223			continue;
4224
4225		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4226		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4227		alu.src[0].sel = ctx->temp_reg;
4228		alu.src[0].chan = i;
4229		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4230		alu.dst.sel = ctx->temp_reg;
4231		alu.dst.chan = i;
4232		if (i == lasti) {
4233			alu.last = 1;
4234		}
4235		alu.dst.write = 1;
4236		r = r600_bytecode_add_alu(ctx->bc, &alu);
4237		if (r)
4238			return r;
4239	}
4240
4241	/* src0 * src1 + (1 - src0) * src2 */
4242	for (i = 0; i < lasti + 1; i++) {
4243		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4244			continue;
4245
4246		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4247		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4248		alu.is_op3 = 1;
4249		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4250		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4251		alu.src[2].sel = ctx->temp_reg;
4252		alu.src[2].chan = i;
4253
4254		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4255		alu.dst.chan = i;
4256		if (i == lasti) {
4257			alu.last = 1;
4258		}
4259		r = r600_bytecode_add_alu(ctx->bc, &alu);
4260		if (r)
4261			return r;
4262	}
4263	return 0;
4264}
4265
4266static int tgsi_cmp(struct r600_shader_ctx *ctx)
4267{
4268	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4269	struct r600_bytecode_alu alu;
4270	int i, r;
4271	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4272
4273	for (i = 0; i < lasti + 1; i++) {
4274		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4275			continue;
4276
4277		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4278		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE);
4279		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4280		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4281		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
4282		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4283		alu.dst.chan = i;
4284		alu.dst.write = 1;
4285		alu.is_op3 = 1;
4286		if (i == lasti)
4287			alu.last = 1;
4288		r = r600_bytecode_add_alu(ctx->bc, &alu);
4289		if (r)
4290			return r;
4291	}
4292	return 0;
4293}
4294
4295static int tgsi_xpd(struct r600_shader_ctx *ctx)
4296{
4297	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4298	static const unsigned int src0_swizzle[] = {2, 0, 1};
4299	static const unsigned int src1_swizzle[] = {1, 2, 0};
4300	struct r600_bytecode_alu alu;
4301	uint32_t use_temp = 0;
4302	int i, r;
4303
4304	if (inst->Dst[0].Register.WriteMask != 0xf)
4305		use_temp = 1;
4306
4307	for (i = 0; i < 4; i++) {
4308		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4309		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4310		if (i < 3) {
4311			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
4312			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
4313		} else {
4314			alu.src[0].sel = V_SQ_ALU_SRC_0;
4315			alu.src[0].chan = i;
4316			alu.src[1].sel = V_SQ_ALU_SRC_0;
4317			alu.src[1].chan = i;
4318		}
4319
4320		alu.dst.sel = ctx->temp_reg;
4321		alu.dst.chan = i;
4322		alu.dst.write = 1;
4323
4324		if (i == 3)
4325			alu.last = 1;
4326		r = r600_bytecode_add_alu(ctx->bc, &alu);
4327		if (r)
4328			return r;
4329	}
4330
4331	for (i = 0; i < 4; i++) {
4332		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4333		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4334
4335		if (i < 3) {
4336			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
4337			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
4338		} else {
4339			alu.src[0].sel = V_SQ_ALU_SRC_0;
4340			alu.src[0].chan = i;
4341			alu.src[1].sel = V_SQ_ALU_SRC_0;
4342			alu.src[1].chan = i;
4343		}
4344
4345		alu.src[2].sel = ctx->temp_reg;
4346		alu.src[2].neg = 1;
4347		alu.src[2].chan = i;
4348
4349		if (use_temp)
4350			alu.dst.sel = ctx->temp_reg;
4351		else
4352			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4353		alu.dst.chan = i;
4354		alu.dst.write = 1;
4355		alu.is_op3 = 1;
4356		if (i == 3)
4357			alu.last = 1;
4358		r = r600_bytecode_add_alu(ctx->bc, &alu);
4359		if (r)
4360			return r;
4361	}
4362	if (use_temp)
4363		return tgsi_helper_copy(ctx, inst);
4364	return 0;
4365}
4366
4367static int tgsi_exp(struct r600_shader_ctx *ctx)
4368{
4369	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4370	struct r600_bytecode_alu alu;
4371	int r;
4372	int i;
4373
4374	/* result.x = 2^floor(src); */
4375	if (inst->Dst[0].Register.WriteMask & 1) {
4376		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4377
4378		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4379		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4380
4381		alu.dst.sel = ctx->temp_reg;
4382		alu.dst.chan = 0;
4383		alu.dst.write = 1;
4384		alu.last = 1;
4385		r = r600_bytecode_add_alu(ctx->bc, &alu);
4386		if (r)
4387			return r;
4388
4389		if (ctx->bc->chip_class == CAYMAN) {
4390			for (i = 0; i < 3; i++) {
4391				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4392				alu.src[0].sel = ctx->temp_reg;
4393				alu.src[0].chan = 0;
4394
4395				alu.dst.sel = ctx->temp_reg;
4396				alu.dst.chan = i;
4397				alu.dst.write = i == 0;
4398				alu.last = i == 2;
4399				r = r600_bytecode_add_alu(ctx->bc, &alu);
4400				if (r)
4401					return r;
4402			}
4403		} else {
4404			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4405			alu.src[0].sel = ctx->temp_reg;
4406			alu.src[0].chan = 0;
4407
4408			alu.dst.sel = ctx->temp_reg;
4409			alu.dst.chan = 0;
4410			alu.dst.write = 1;
4411			alu.last = 1;
4412			r = r600_bytecode_add_alu(ctx->bc, &alu);
4413			if (r)
4414				return r;
4415		}
4416	}
4417
4418	/* result.y = tmp - floor(tmp); */
4419	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4420		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4421
4422		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
4423		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4424
4425		alu.dst.sel = ctx->temp_reg;
4426#if 0
4427		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4428		if (r)
4429			return r;
4430#endif
4431		alu.dst.write = 1;
4432		alu.dst.chan = 1;
4433
4434		alu.last = 1;
4435
4436		r = r600_bytecode_add_alu(ctx->bc, &alu);
4437		if (r)
4438			return r;
4439	}
4440
4441	/* result.z = RoughApprox2ToX(tmp);*/
4442	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
4443		if (ctx->bc->chip_class == CAYMAN) {
4444			for (i = 0; i < 3; i++) {
4445				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4446				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4447				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4448
4449				alu.dst.sel = ctx->temp_reg;
4450				alu.dst.chan = i;
4451				if (i == 2) {
4452					alu.dst.write = 1;
4453					alu.last = 1;
4454				}
4455
4456				r = r600_bytecode_add_alu(ctx->bc, &alu);
4457				if (r)
4458					return r;
4459			}
4460		} else {
4461			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4462			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4463			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4464
4465			alu.dst.sel = ctx->temp_reg;
4466			alu.dst.write = 1;
4467			alu.dst.chan = 2;
4468
4469			alu.last = 1;
4470
4471			r = r600_bytecode_add_alu(ctx->bc, &alu);
4472			if (r)
4473				return r;
4474		}
4475	}
4476
4477	/* result.w = 1.0;*/
4478	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
4479		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4480
4481		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4482		alu.src[0].sel = V_SQ_ALU_SRC_1;
4483		alu.src[0].chan = 0;
4484
4485		alu.dst.sel = ctx->temp_reg;
4486		alu.dst.chan = 3;
4487		alu.dst.write = 1;
4488		alu.last = 1;
4489		r = r600_bytecode_add_alu(ctx->bc, &alu);
4490		if (r)
4491			return r;
4492	}
4493	return tgsi_helper_copy(ctx, inst);
4494}
4495
4496static int tgsi_log(struct r600_shader_ctx *ctx)
4497{
4498	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4499	struct r600_bytecode_alu alu;
4500	int r;
4501	int i;
4502
4503	/* result.x = floor(log2(|src|)); */
4504	if (inst->Dst[0].Register.WriteMask & 1) {
4505		if (ctx->bc->chip_class == CAYMAN) {
4506			for (i = 0; i < 3; i++) {
4507				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4508
4509				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4510				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4511				r600_bytecode_src_set_abs(&alu.src[0]);
4512
4513				alu.dst.sel = ctx->temp_reg;
4514				alu.dst.chan = i;
4515				if (i == 0)
4516					alu.dst.write = 1;
4517				if (i == 2)
4518					alu.last = 1;
4519				r = r600_bytecode_add_alu(ctx->bc, &alu);
4520				if (r)
4521					return r;
4522			}
4523
4524		} else {
4525			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4526
4527			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4528			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4529			r600_bytecode_src_set_abs(&alu.src[0]);
4530
4531			alu.dst.sel = ctx->temp_reg;
4532			alu.dst.chan = 0;
4533			alu.dst.write = 1;
4534			alu.last = 1;
4535			r = r600_bytecode_add_alu(ctx->bc, &alu);
4536			if (r)
4537				return r;
4538		}
4539
4540		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4541		alu.src[0].sel = ctx->temp_reg;
4542		alu.src[0].chan = 0;
4543
4544		alu.dst.sel = ctx->temp_reg;
4545		alu.dst.chan = 0;
4546		alu.dst.write = 1;
4547		alu.last = 1;
4548
4549		r = r600_bytecode_add_alu(ctx->bc, &alu);
4550		if (r)
4551			return r;
4552	}
4553
4554	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
4555	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4556
4557		if (ctx->bc->chip_class == CAYMAN) {
4558			for (i = 0; i < 3; i++) {
4559				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4560
4561				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4562				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4563				r600_bytecode_src_set_abs(&alu.src[0]);
4564
4565				alu.dst.sel = ctx->temp_reg;
4566				alu.dst.chan = i;
4567				if (i == 1)
4568					alu.dst.write = 1;
4569				if (i == 2)
4570					alu.last = 1;
4571
4572				r = r600_bytecode_add_alu(ctx->bc, &alu);
4573				if (r)
4574					return r;
4575			}
4576		} else {
4577			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4578
4579			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4580			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4581			r600_bytecode_src_set_abs(&alu.src[0]);
4582
4583			alu.dst.sel = ctx->temp_reg;
4584			alu.dst.chan = 1;
4585			alu.dst.write = 1;
4586			alu.last = 1;
4587
4588			r = r600_bytecode_add_alu(ctx->bc, &alu);
4589			if (r)
4590				return r;
4591		}
4592
4593		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4594
4595		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4596		alu.src[0].sel = ctx->temp_reg;
4597		alu.src[0].chan = 1;
4598
4599		alu.dst.sel = ctx->temp_reg;
4600		alu.dst.chan = 1;
4601		alu.dst.write = 1;
4602		alu.last = 1;
4603
4604		r = r600_bytecode_add_alu(ctx->bc, &alu);
4605		if (r)
4606			return r;
4607
4608		if (ctx->bc->chip_class == CAYMAN) {
4609			for (i = 0; i < 3; i++) {
4610				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4611				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4612				alu.src[0].sel = ctx->temp_reg;
4613				alu.src[0].chan = 1;
4614
4615				alu.dst.sel = ctx->temp_reg;
4616				alu.dst.chan = i;
4617				if (i == 1)
4618					alu.dst.write = 1;
4619				if (i == 2)
4620					alu.last = 1;
4621
4622				r = r600_bytecode_add_alu(ctx->bc, &alu);
4623				if (r)
4624					return r;
4625			}
4626		} else {
4627			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4628			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4629			alu.src[0].sel = ctx->temp_reg;
4630			alu.src[0].chan = 1;
4631
4632			alu.dst.sel = ctx->temp_reg;
4633			alu.dst.chan = 1;
4634			alu.dst.write = 1;
4635			alu.last = 1;
4636
4637			r = r600_bytecode_add_alu(ctx->bc, &alu);
4638			if (r)
4639				return r;
4640		}
4641
4642		if (ctx->bc->chip_class == CAYMAN) {
4643			for (i = 0; i < 3; i++) {
4644				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4645				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4646				alu.src[0].sel = ctx->temp_reg;
4647				alu.src[0].chan = 1;
4648
4649				alu.dst.sel = ctx->temp_reg;
4650				alu.dst.chan = i;
4651				if (i == 1)
4652					alu.dst.write = 1;
4653				if (i == 2)
4654					alu.last = 1;
4655
4656				r = r600_bytecode_add_alu(ctx->bc, &alu);
4657				if (r)
4658					return r;
4659			}
4660		} else {
4661			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4662			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4663			alu.src[0].sel = ctx->temp_reg;
4664			alu.src[0].chan = 1;
4665
4666			alu.dst.sel = ctx->temp_reg;
4667			alu.dst.chan = 1;
4668			alu.dst.write = 1;
4669			alu.last = 1;
4670
4671			r = r600_bytecode_add_alu(ctx->bc, &alu);
4672			if (r)
4673				return r;
4674		}
4675
4676		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4677
4678		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4679
4680		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4681		r600_bytecode_src_set_abs(&alu.src[0]);
4682
4683		alu.src[1].sel = ctx->temp_reg;
4684		alu.src[1].chan = 1;
4685
4686		alu.dst.sel = ctx->temp_reg;
4687		alu.dst.chan = 1;
4688		alu.dst.write = 1;
4689		alu.last = 1;
4690
4691		r = r600_bytecode_add_alu(ctx->bc, &alu);
4692		if (r)
4693			return r;
4694	}
4695
4696	/* result.z = log2(|src|);*/
4697	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
4698		if (ctx->bc->chip_class == CAYMAN) {
4699			for (i = 0; i < 3; i++) {
4700				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4701
4702				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4703				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4704				r600_bytecode_src_set_abs(&alu.src[0]);
4705
4706				alu.dst.sel = ctx->temp_reg;
4707				if (i == 2)
4708					alu.dst.write = 1;
4709				alu.dst.chan = i;
4710				if (i == 2)
4711					alu.last = 1;
4712
4713				r = r600_bytecode_add_alu(ctx->bc, &alu);
4714				if (r)
4715					return r;
4716			}
4717		} else {
4718			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4719
4720			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4721			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4722			r600_bytecode_src_set_abs(&alu.src[0]);
4723
4724			alu.dst.sel = ctx->temp_reg;
4725			alu.dst.write = 1;
4726			alu.dst.chan = 2;
4727			alu.last = 1;
4728
4729			r = r600_bytecode_add_alu(ctx->bc, &alu);
4730			if (r)
4731				return r;
4732		}
4733	}
4734
4735	/* result.w = 1.0; */
4736	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
4737		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4738
4739		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4740		alu.src[0].sel = V_SQ_ALU_SRC_1;
4741		alu.src[0].chan = 0;
4742
4743		alu.dst.sel = ctx->temp_reg;
4744		alu.dst.chan = 3;
4745		alu.dst.write = 1;
4746		alu.last = 1;
4747
4748		r = r600_bytecode_add_alu(ctx->bc, &alu);
4749		if (r)
4750			return r;
4751	}
4752
4753	return tgsi_helper_copy(ctx, inst);
4754}
4755
4756static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
4757{
4758	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4759	struct r600_bytecode_alu alu;
4760	int r;
4761
4762	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4763
4764	switch (inst->Instruction.Opcode) {
4765	case TGSI_OPCODE_ARL:
4766		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR;
4767		break;
4768	case TGSI_OPCODE_ARR:
4769		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4770		break;
4771	case TGSI_OPCODE_UARL:
4772		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4773		break;
4774	default:
4775		assert(0);
4776		return -1;
4777	}
4778
4779	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4780	alu.last = 1;
4781	alu.dst.sel = ctx->bc->ar_reg;
4782	alu.dst.write = 1;
4783	r = r600_bytecode_add_alu(ctx->bc, &alu);
4784	if (r)
4785		return r;
4786
4787	ctx->bc->ar_loaded = 0;
4788	return 0;
4789}
4790static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
4791{
4792	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4793	struct r600_bytecode_alu alu;
4794	int r;
4795
4796	switch (inst->Instruction.Opcode) {
4797	case TGSI_OPCODE_ARL:
4798		memset(&alu, 0, sizeof(alu));
4799		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR;
4800		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4801		alu.dst.sel = ctx->bc->ar_reg;
4802		alu.dst.write = 1;
4803		alu.last = 1;
4804
4805		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4806			return r;
4807
4808		memset(&alu, 0, sizeof(alu));
4809		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4810		alu.src[0].sel = ctx->bc->ar_reg;
4811		alu.dst.sel = ctx->bc->ar_reg;
4812		alu.dst.write = 1;
4813		alu.last = 1;
4814
4815		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4816			return r;
4817		break;
4818	case TGSI_OPCODE_ARR:
4819		memset(&alu, 0, sizeof(alu));
4820		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4821		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4822		alu.dst.sel = ctx->bc->ar_reg;
4823		alu.dst.write = 1;
4824		alu.last = 1;
4825
4826		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4827			return r;
4828		break;
4829	case TGSI_OPCODE_UARL:
4830		memset(&alu, 0, sizeof(alu));
4831		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4832		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4833		alu.dst.sel = ctx->bc->ar_reg;
4834		alu.dst.write = 1;
4835		alu.last = 1;
4836
4837		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4838			return r;
4839		break;
4840	default:
4841		assert(0);
4842		return -1;
4843	}
4844
4845	ctx->bc->ar_loaded = 0;
4846	return 0;
4847}
4848
4849static int tgsi_opdst(struct r600_shader_ctx *ctx)
4850{
4851	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4852	struct r600_bytecode_alu alu;
4853	int i, r = 0;
4854
4855	for (i = 0; i < 4; i++) {
4856		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4857
4858		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4859		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4860
4861		if (i == 0 || i == 3) {
4862			alu.src[0].sel = V_SQ_ALU_SRC_1;
4863		} else {
4864			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4865		}
4866
4867		if (i == 0 || i == 2) {
4868			alu.src[1].sel = V_SQ_ALU_SRC_1;
4869		} else {
4870			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4871		}
4872		if (i == 3)
4873			alu.last = 1;
4874		r = r600_bytecode_add_alu(ctx->bc, &alu);
4875		if (r)
4876			return r;
4877	}
4878	return 0;
4879}
4880
4881static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
4882{
4883	struct r600_bytecode_alu alu;
4884	int r;
4885
4886	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4887	alu.inst = opcode;
4888	alu.execute_mask = 1;
4889	alu.update_pred = 1;
4890
4891	alu.dst.sel = ctx->temp_reg;
4892	alu.dst.write = 1;
4893	alu.dst.chan = 0;
4894
4895	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4896	alu.src[1].sel = V_SQ_ALU_SRC_0;
4897	alu.src[1].chan = 0;
4898
4899	alu.last = 1;
4900
4901	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
4902	if (r)
4903		return r;
4904	return 0;
4905}
4906
4907static int pops(struct r600_shader_ctx *ctx, int pops)
4908{
4909	unsigned force_pop = ctx->bc->force_add_cf;
4910
4911	if (!force_pop) {
4912		int alu_pop = 3;
4913		if (ctx->bc->cf_last) {
4914			if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU))
4915				alu_pop = 0;
4916			else if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER))
4917				alu_pop = 1;
4918		}
4919		alu_pop += pops;
4920		if (alu_pop == 1) {
4921			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER);
4922			ctx->bc->force_add_cf = 1;
4923		} else if (alu_pop == 2) {
4924			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER);
4925			ctx->bc->force_add_cf = 1;
4926		} else {
4927			force_pop = 1;
4928		}
4929	}
4930
4931	if (force_pop) {
4932		r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_POP));
4933		ctx->bc->cf_last->pop_count = pops;
4934		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
4935	}
4936
4937	return 0;
4938}
4939
4940static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
4941{
4942	switch(reason) {
4943	case FC_PUSH_VPM:
4944		ctx->bc->callstack[ctx->bc->call_sp].current--;
4945		break;
4946	case FC_PUSH_WQM:
4947	case FC_LOOP:
4948		ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
4949		break;
4950	case FC_REP:
4951		/* TOODO : for 16 vp asic should -= 2; */
4952		ctx->bc->callstack[ctx->bc->call_sp].current --;
4953		break;
4954	}
4955}
4956
4957static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
4958{
4959	if (check_max_only) {
4960		int diff;
4961		switch (reason) {
4962		case FC_PUSH_VPM:
4963			diff = 1;
4964			break;
4965		case FC_PUSH_WQM:
4966			diff = 4;
4967			break;
4968		default:
4969			assert(0);
4970			diff = 0;
4971		}
4972		if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
4973		    ctx->bc->callstack[ctx->bc->call_sp].max) {
4974			ctx->bc->callstack[ctx->bc->call_sp].max =
4975				ctx->bc->callstack[ctx->bc->call_sp].current + diff;
4976		}
4977		return;
4978	}
4979	switch (reason) {
4980	case FC_PUSH_VPM:
4981		ctx->bc->callstack[ctx->bc->call_sp].current++;
4982		break;
4983	case FC_PUSH_WQM:
4984	case FC_LOOP:
4985		ctx->bc->callstack[ctx->bc->call_sp].current += 4;
4986		break;
4987	case FC_REP:
4988		ctx->bc->callstack[ctx->bc->call_sp].current++;
4989		break;
4990	}
4991
4992	if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
4993	    ctx->bc->callstack[ctx->bc->call_sp].max) {
4994		ctx->bc->callstack[ctx->bc->call_sp].max =
4995			ctx->bc->callstack[ctx->bc->call_sp].current;
4996	}
4997}
4998
4999static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
5000{
5001	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
5002
5003	sp->mid = realloc((void *)sp->mid,
5004						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
5005	sp->mid[sp->num_mid] = ctx->bc->cf_last;
5006	sp->num_mid++;
5007}
5008
5009static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
5010{
5011	ctx->bc->fc_sp++;
5012	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
5013	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
5014}
5015
5016static void fc_poplevel(struct r600_shader_ctx *ctx)
5017{
5018	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
5019	free(sp->mid);
5020	sp->mid = NULL;
5021	sp->num_mid = 0;
5022	sp->start = NULL;
5023	sp->type = 0;
5024	ctx->bc->fc_sp--;
5025}
5026
5027#if 0
5028static int emit_return(struct r600_shader_ctx *ctx)
5029{
5030	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
5031	return 0;
5032}
5033
5034static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
5035{
5036
5037	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
5038	ctx->bc->cf_last->pop_count = pops;
5039	/* XXX work out offset */
5040	return 0;
5041}
5042
5043static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
5044{
5045	return 0;
5046}
5047
5048static void emit_testflag(struct r600_shader_ctx *ctx)
5049{
5050
5051}
5052
5053static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
5054{
5055	emit_testflag(ctx);
5056	emit_jump_to_offset(ctx, 1, 4);
5057	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
5058	pops(ctx, ifidx + 1);
5059	emit_return(ctx);
5060}
5061
5062static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
5063{
5064	emit_testflag(ctx);
5065
5066	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
5067	ctx->bc->cf_last->pop_count = 1;
5068
5069	fc_set_mid(ctx, fc_sp);
5070
5071	pops(ctx, 1);
5072}
5073#endif
5074
5075static int tgsi_if(struct r600_shader_ctx *ctx)
5076{
5077	emit_logic_pred(ctx, CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
5078
5079	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
5080
5081	fc_pushlevel(ctx, FC_IF);
5082
5083	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
5084	return 0;
5085}
5086
5087static int tgsi_else(struct r600_shader_ctx *ctx)
5088{
5089	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_ELSE));
5090	ctx->bc->cf_last->pop_count = 1;
5091
5092	fc_set_mid(ctx, ctx->bc->fc_sp);
5093	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
5094	return 0;
5095}
5096
5097static int tgsi_endif(struct r600_shader_ctx *ctx)
5098{
5099	pops(ctx, 1);
5100	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
5101		R600_ERR("if/endif unbalanced in shader\n");
5102		return -1;
5103	}
5104
5105	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
5106		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5107		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
5108	} else {
5109		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
5110	}
5111	fc_poplevel(ctx);
5112
5113	callstack_decrease_current(ctx, FC_PUSH_VPM);
5114	return 0;
5115}
5116
5117static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
5118{
5119	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
5120	 * limited to 4096 iterations, like the other LOOP_* instructions. */
5121	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10));
5122
5123	fc_pushlevel(ctx, FC_LOOP);
5124
5125	/* check stack depth */
5126	callstack_check_depth(ctx, FC_LOOP, 0);
5127	return 0;
5128}
5129
5130static int tgsi_endloop(struct r600_shader_ctx *ctx)
5131{
5132	int i;
5133
5134	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END));
5135
5136	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
5137		R600_ERR("loop/endloop in shader code are not paired.\n");
5138		return -EINVAL;
5139	}
5140
5141	/* fixup loop pointers - from r600isa
5142	   LOOP END points to CF after LOOP START,
5143	   LOOP START point to CF after LOOP END
5144	   BRK/CONT point to LOOP END CF
5145	*/
5146	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
5147
5148	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5149
5150	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
5151		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
5152	}
5153	/* XXX add LOOPRET support */
5154	fc_poplevel(ctx);
5155	callstack_decrease_current(ctx, FC_LOOP);
5156	return 0;
5157}
5158
5159static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
5160{
5161	unsigned int fscp;
5162
5163	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
5164	{
5165		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
5166			break;
5167	}
5168
5169	if (fscp == 0) {
5170		R600_ERR("Break not inside loop/endloop pair\n");
5171		return -EINVAL;
5172	}
5173
5174	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
5175
5176	fc_set_mid(ctx, fscp);
5177
5178	callstack_check_depth(ctx, FC_PUSH_VPM, 1);
5179	return 0;
5180}
5181
5182static int tgsi_umad(struct r600_shader_ctx *ctx)
5183{
5184	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5185	struct r600_bytecode_alu alu;
5186	int i, j, r;
5187	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5188
5189	/* src0 * src1 */
5190	for (i = 0; i < lasti + 1; i++) {
5191		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5192			continue;
5193
5194		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5195
5196		alu.dst.chan = i;
5197		alu.dst.sel = ctx->temp_reg;
5198		alu.dst.write = 1;
5199
5200		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
5201		for (j = 0; j < 2; j++) {
5202		        r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5203		}
5204
5205		alu.last = 1;
5206		r = r600_bytecode_add_alu(ctx->bc, &alu);
5207		if (r)
5208			return r;
5209	}
5210
5211
5212	for (i = 0; i < lasti + 1; i++) {
5213		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5214			continue;
5215
5216		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5217		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5218
5219		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
5220
5221		alu.src[0].sel = ctx->temp_reg;
5222		alu.src[0].chan = i;
5223
5224		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5225		if (i == lasti) {
5226			alu.last = 1;
5227		}
5228		r = r600_bytecode_add_alu(ctx->bc, &alu);
5229		if (r)
5230			return r;
5231	}
5232	return 0;
5233}
5234
/*
 * TGSI opcode table using the un-prefixed V_SQ_* / SQ_TEX_* hardware opcode
 * encodings.
 *
 * Each row lists, in order: the TGSI opcode (or the raw number of an
 * unassigned opcode slot, marked "gap"), a flag that is 1 only on the MAD row
 * (the only row whose hardware opcode is an OP3 instruction), the hardware
 * opcode, and the handler that emits bytecode for the instruction.  Opcodes
 * without an implementation are routed to tgsi_unsupported.
 *
 * NOTE(review): rows appear in ascending TGSI opcode order with explicit
 * placeholders for the gaps, which suggests the table is indexed directly by
 * opcode value - confirm against the lookup code before reordering, adding
 * or removing rows.
 */
5235static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
5236	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5237	{TGSI_OPCODE_MOV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5238	{TGSI_OPCODE_LIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5239
5240	/* XXX:
5241	 * For state trackers other than OpenGL, we'll want to use
5242	 * _RECIP_IEEE instead.
5243	 */
5244	{TGSI_OPCODE_RCP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
5245
5246	{TGSI_OPCODE_RSQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_rsq},
5247	{TGSI_OPCODE_EXP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5248	{TGSI_OPCODE_LOG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5249	{TGSI_OPCODE_MUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5250	{TGSI_OPCODE_ADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5251	{TGSI_OPCODE_DP3,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5252	{TGSI_OPCODE_DP4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5253	{TGSI_OPCODE_DST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5254	{TGSI_OPCODE_MIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5255	{TGSI_OPCODE_MAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5256	{TGSI_OPCODE_SLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5257	{TGSI_OPCODE_SGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5258	{TGSI_OPCODE_MAD,	1, V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5259	{TGSI_OPCODE_SUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5260	{TGSI_OPCODE_LRP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5261	{TGSI_OPCODE_CND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5262	/* gap */
5263	{20,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5264	{TGSI_OPCODE_DP2A,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5265	/* gap */
5266	{22,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5267	{23,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5268	{TGSI_OPCODE_FRC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5269	{TGSI_OPCODE_CLAMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5270	{TGSI_OPCODE_FLR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5271	{TGSI_OPCODE_ROUND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5272	{TGSI_OPCODE_EX2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5273	{TGSI_OPCODE_LG2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5274	{TGSI_OPCODE_POW,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5275	{TGSI_OPCODE_XPD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5276	/* gap */
5277	{32,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5278	{TGSI_OPCODE_ABS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5279	{TGSI_OPCODE_RCC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5280	{TGSI_OPCODE_DPH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5281	{TGSI_OPCODE_COS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5282	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5283	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5284	{TGSI_OPCODE_KILP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5285	{TGSI_OPCODE_PK2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5286	{TGSI_OPCODE_PK2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5287	{TGSI_OPCODE_PK4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5288	{TGSI_OPCODE_PK4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5289	{TGSI_OPCODE_RFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5290	{TGSI_OPCODE_SEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5291	{TGSI_OPCODE_SFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5292	{TGSI_OPCODE_SGT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5293	{TGSI_OPCODE_SIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5294	{TGSI_OPCODE_SLE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5295	{TGSI_OPCODE_SNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5296	{TGSI_OPCODE_STR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5297	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5298	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5299	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5300	{TGSI_OPCODE_UP2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5301	{TGSI_OPCODE_UP2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5302	{TGSI_OPCODE_UP4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5303	{TGSI_OPCODE_UP4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5304	{TGSI_OPCODE_X2D,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5305	{TGSI_OPCODE_ARA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5306	{TGSI_OPCODE_ARR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5307	{TGSI_OPCODE_BRA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5308	{TGSI_OPCODE_CAL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5309	{TGSI_OPCODE_RET,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5310	{TGSI_OPCODE_SSG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5311	{TGSI_OPCODE_CMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5312	{TGSI_OPCODE_SCS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5313	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5314	{TGSI_OPCODE_NRM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5315	{TGSI_OPCODE_DIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5316	{TGSI_OPCODE_DP2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5317	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5318	{TGSI_OPCODE_BRK,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5319	{TGSI_OPCODE_IF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5320	/* gap */
5321	{75,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5322	{76,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5323	{TGSI_OPCODE_ELSE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5324	{TGSI_OPCODE_ENDIF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5325	/* gap */
5326	{79,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5327	{80,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5328	{TGSI_OPCODE_PUSHA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5329	{TGSI_OPCODE_POPA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5330	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5331	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5332	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5333	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5334	{TGSI_OPCODE_SHL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2_trans},
5335	/* gap */
5336	{88,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5337	{TGSI_OPCODE_AND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5338	{TGSI_OPCODE_OR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5339	{TGSI_OPCODE_MOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5340	{TGSI_OPCODE_XOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5341	{TGSI_OPCODE_SAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5342	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5343	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5344	{TGSI_OPCODE_CONT,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5345	{TGSI_OPCODE_EMIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5346	{TGSI_OPCODE_ENDPRIM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5347	{TGSI_OPCODE_BGNLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5348	{TGSI_OPCODE_BGNSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5349	{TGSI_OPCODE_ENDLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5350	{TGSI_OPCODE_ENDSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5351	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5352	/* gap */
5353	{104,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5354	{105,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5355	{106,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5356	{TGSI_OPCODE_NOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5357	/* gap */
5358	{108,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5359	{109,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5360	{110,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5361	{111,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5362	{TGSI_OPCODE_NRM4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5363	{TGSI_OPCODE_CALLNZ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5364	{TGSI_OPCODE_IFC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5365	{TGSI_OPCODE_BREAKC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5366	{TGSI_OPCODE_KIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5367	{TGSI_OPCODE_END,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5368	/* gap */
5369	{118,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5370	{TGSI_OPCODE_F2I,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2_trans},
5371	{TGSI_OPCODE_IDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5372	{TGSI_OPCODE_IMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5373	{TGSI_OPCODE_IMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5374	{TGSI_OPCODE_INEG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5375	{TGSI_OPCODE_ISGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5376	{TGSI_OPCODE_ISHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2_trans},
5377	{TGSI_OPCODE_ISLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5378	{TGSI_OPCODE_F2U,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2_trans},
5379	{TGSI_OPCODE_U2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5380	{TGSI_OPCODE_UADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5381	{TGSI_OPCODE_UDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5382	{TGSI_OPCODE_UMAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5383	{TGSI_OPCODE_UMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5384	{TGSI_OPCODE_UMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5385	{TGSI_OPCODE_UMOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5386	{TGSI_OPCODE_UMUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5387	{TGSI_OPCODE_USEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5388	{TGSI_OPCODE_USGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5389	{TGSI_OPCODE_USHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2_trans},
5390	{TGSI_OPCODE_USLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5391	{TGSI_OPCODE_USNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2_swap},
5392	{TGSI_OPCODE_SWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5393	{TGSI_OPCODE_CASE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5394	{TGSI_OPCODE_DEFAULT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5395	{TGSI_OPCODE_ENDSWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5396	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5397	{TGSI_OPCODE_SAMPLE_I,  0, 0, tgsi_unsupported},
5398	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
5399	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5400	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5401	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5402	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5403	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5404	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5405	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5406	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5407	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5408	{TGSI_OPCODE_UARL,      0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_r600_arl},
5409	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5410	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5411	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5412	{TGSI_OPCODE_LAST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5413};
5414
/*
 * TGSI opcode table using the EG_-prefixed hardware opcode encodings
 * (presumably the Evergreen family - confirm against where this table is
 * selected).
 *
 * Each row lists, in order: the TGSI opcode (or the raw number of an
 * unassigned opcode slot, marked "gap"), a flag that is 1 only on the MAD row
 * (the only row whose hardware opcode is an OP3 instruction), the hardware
 * opcode, and the handler that emits bytecode for the instruction.  Opcodes
 * without an implementation are routed to tgsi_unsupported.
 *
 * NOTE(review): rows appear in ascending TGSI opcode order with explicit
 * placeholders for the gaps, which suggests the table is indexed directly by
 * opcode value - confirm against the lookup code before reordering, adding
 * or removing rows.
 */
5415static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
5416	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5417	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5418	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5419	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, tgsi_trans_srcx_replicate},
5420	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, tgsi_rsq},
5421	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5422	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5423	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5424	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5425	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5426	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5427	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5428	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5429	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5430	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5431	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5432	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5433	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5434	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5435	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5436	/* gap */
5437	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5438	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5439	/* gap */
5440	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5441	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5442	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5443	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5444	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5445	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5446	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5447	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5448	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5449	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5450	/* gap */
5451	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5452	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5453	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5454	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5455	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5456	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5457	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5458	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5459	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5460	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5461	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5462	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5463	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5464	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5465	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5466	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5467	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5468	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5469	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5470	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5471	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5472	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5473	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5474	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5475	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5476	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5477	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5478	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5479	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5480	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5481	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5482	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5483	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5484	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5485	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5486	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5487	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5488	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5489	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5490	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5491	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5492	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5493	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5494	/* gap */
5495	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5496	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5497	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5498	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5499	/* gap */
5500	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5501	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5502	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5503	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5504	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5505	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5506	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5507	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5508	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
5509	/* gap */
5510	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5511	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5512	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5513	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5514	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5515	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5516	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5517	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5518	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5519	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5520	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5521	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5522	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5523	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5524	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5525	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5526	/* gap */
5527	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5528	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5529	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5530	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5531	/* gap */
5532	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5533	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5534	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5535	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5536	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5537	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5538	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5539	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5540	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5541	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5542	/* gap */
5543	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5544	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_f2i},
5545	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5546	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5547	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5548	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5549	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5550	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
5551	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5552	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_f2i},
5553	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5554	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5555	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5556	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5557	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5558	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5559	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5560	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5561	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5562	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5563	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
5564	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5565	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
5566	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5567	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5568	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5569	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5570	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5571	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
5572	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
5573	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5574	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5575	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5576	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5577	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5578	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5579	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5580	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5581	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5582	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
5583	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5584	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5585	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5586	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5587};
5588
5589static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
5590	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5591	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5592	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5593	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, cayman_emit_float_instr},
5594	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, cayman_emit_float_instr},
5595	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5596	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5597	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5598	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5599	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5600	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5601	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5602	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5603	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5604	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5605	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5606	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5607	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5608	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5609	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5610	/* gap */
5611	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5612	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5613	/* gap */
5614	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5615	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5616	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5617	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5618	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5619	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5620	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, cayman_emit_float_instr},
5621	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, cayman_emit_float_instr},
5622	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, cayman_pow},
5623	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5624	/* gap */
5625	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5626	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5627	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5628	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5629	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, cayman_trig},
5630	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5631	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5632	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5633	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5634	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5635	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5636	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5637	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5638	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5639	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5640	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5641	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, cayman_trig},
5642	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5643	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5644	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5645	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5646	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5647	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5648	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5649	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5650	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5651	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5652	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5653	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5654	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5655	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5656	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5657	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5658	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5659	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5660	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5661	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5662	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5663	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5664	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5665	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5666	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5667	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5668	/* gap */
5669	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5670	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5671	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5672	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5673	/* gap */
5674	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5675	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5676	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5677	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5678	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5679	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2},
5680	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5681	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5682	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
5683	/* gap */
5684	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5685	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5686	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5687	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5688	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5689	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5690	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5691	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5692	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5693	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5694	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5695	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5696	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5697	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5698	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5699	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5700	/* gap */
5701	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5702	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5703	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5704	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5705	/* gap */
5706	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5707	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5708	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5709	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5710	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5711	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5712	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5713	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5714	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5715	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5716	/* gap */
5717	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5718	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2},
5719	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5720	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5721	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5722	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5723	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5724	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
5725	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5726	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
5727	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2},
5728	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5729	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5730	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5731	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5732	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5733	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5734	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT, cayman_mul_int_instr},
5735	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5736	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5737	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
5738	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5739	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
5740	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5741	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5742	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5743	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5744	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5745	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
5746	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
5747	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5748	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5749	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5750	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5751	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5752	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5753	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5754	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5755	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5756	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
5757	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5758	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5759	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5760	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5761};
5762