r600_shader.c revision 8bf7044ec6ab041ebeb1db9ebe378f8e1f455f4e
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600d.h"
28
29#include "pipe/p_shader_tokens.h"
30#include "tgsi/tgsi_info.h"
31#include "tgsi/tgsi_parse.h"
32#include "tgsi/tgsi_scan.h"
33#include "tgsi/tgsi_dump.h"
34#include "util/u_memory.h"
35#include <stdio.h>
36#include <errno.h>
37#include <byteswap.h>
38
39/* CAYMAN notes
40Why CAYMAN got loops for lots of instructions is explained here.
41
42-These 8xx t-slot only ops are implemented in all vector slots.
43MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
44These 8xx t-slot only opcodes become vector ops, with all four
45slots expecting the arguments on sources a and b. Result is
46broadcast to all channels.
47MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
48These 8xx t-slot only opcodes become vector ops in the z, y, and
49x slots.
50EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
51RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
52SQRT_IEEE/_64
53SIN/COS
54The w slot may have an independent co-issued operation, or if the
55result is required to be in the w slot, the opcode above may be
56issued in the w slot as well.
57The compiler must issue the source argument to slots z, y, and x
58*/
59
60static int r600_pipe_shader(struct pipe_context *ctx, struct r600_pipe_shader *shader)
61{
62	struct r600_context *rctx = (struct r600_context *)ctx;
63	struct r600_shader *rshader = &shader->shader;
64	uint32_t *ptr;
65	int	i;
66
67	/* copy new shader */
68	if (shader->bo == NULL) {
69		shader->bo = (struct r600_resource*)
70			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, rshader->bc.ndw * 4);
71		if (shader->bo == NULL) {
72			return -ENOMEM;
73		}
74		ptr = (uint32_t*)rctx->ws->buffer_map(shader->bo->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
75		if (R600_BIG_ENDIAN) {
76			for (i = 0; i < rshader->bc.ndw; ++i) {
77				ptr[i] = bswap_32(rshader->bc.bytecode[i]);
78			}
79		} else {
80			memcpy(ptr, rshader->bc.bytecode, rshader->bc.ndw * sizeof(*ptr));
81		}
82		rctx->ws->buffer_unmap(shader->bo->cs_buf);
83	}
84	/* build state */
85	switch (rshader->processor_type) {
86	case TGSI_PROCESSOR_VERTEX:
87		if (rctx->chip_class >= EVERGREEN) {
88			evergreen_pipe_shader_vs(ctx, shader);
89		} else {
90			r600_pipe_shader_vs(ctx, shader);
91		}
92		break;
93	case TGSI_PROCESSOR_FRAGMENT:
94		if (rctx->chip_class >= EVERGREEN) {
95			evergreen_pipe_shader_ps(ctx, shader);
96		} else {
97			r600_pipe_shader_ps(ctx, shader);
98		}
99		break;
100	default:
101		return -EINVAL;
102	}
103	return 0;
104}
105
/* Forward declaration: called by r600_pipe_shader_create() below, which
 * treats a non-zero return value as a failed translation from TGSI. */
static int r600_shader_from_tgsi(struct r600_screen *rscreen,
				 struct r600_pipe_shader *pipeshader,
				 struct r600_shader_key key);
109
110int r600_pipe_shader_create(struct pipe_context *ctx,
111			    struct r600_pipe_shader *shader,
112			    struct r600_shader_key key)
113{
114	static int dump_shaders = -1;
115	struct r600_context *rctx = (struct r600_context *)ctx;
116	struct r600_pipe_shader_selector *sel = shader->selector;
117	int r;
118
119	/* Would like some magic "get_bool_option_once" routine.
120	*/
121	if (dump_shaders == -1)
122		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
123
124	if (dump_shaders) {
125		fprintf(stderr, "--------------------------------------------------------------\n");
126		tgsi_dump(sel->tokens, 0);
127
128		if (sel->so.num_outputs) {
129			unsigned i;
130			fprintf(stderr, "STREAMOUT\n");
131			for (i = 0; i < sel->so.num_outputs; i++) {
132				unsigned mask = ((1 << sel->so.output[i].num_components) - 1) <<
133						sel->so.output[i].start_component;
134				fprintf(stderr, "  %i: MEM_STREAM0_BUF%i OUT[%i].%s%s%s%s\n", i,
135					sel->so.output[i].output_buffer, sel->so.output[i].register_index,
136				        mask & 1 ? "x" : "_",
137				        (mask >> 1) & 1 ? "y" : "_",
138				        (mask >> 2) & 1 ? "z" : "_",
139				        (mask >> 3) & 1 ? "w" : "_");
140			}
141		}
142	}
143	r = r600_shader_from_tgsi(rctx->screen, shader, key);
144	if (r) {
145		R600_ERR("translation from TGSI failed !\n");
146		return r;
147	}
148	r = r600_bytecode_build(&shader->shader.bc);
149	if (r) {
150		R600_ERR("building bytecode failed !\n");
151		return r;
152	}
153	if (dump_shaders) {
154		r600_bytecode_dump(&shader->shader.bc);
155		fprintf(stderr, "______________________________________________________________\n");
156	}
157	return r600_pipe_shader(ctx, shader);
158}
159
160void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
161{
162	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
163	r600_bytecode_clear(&shader->shader.bc);
164}
165
166/*
167 * tgsi -> r600 shader
168 */
169struct r600_shader_tgsi_instruction;
170
/* A TGSI source operand after conversion by tgsi_src(). */
struct r600_shader_src {
	/* register index plus per-file offset, or a V_SQ_ALU_SRC_* selector
	 * (e.g. V_SQ_ALU_SRC_LITERAL for immediates) */
	unsigned				sel;
	/* source swizzle for the x, y, z and w components */
	unsigned				swizzle[4];
	/* negate modifier, taken from the TGSI register */
	unsigned				neg;
	/* absolute-value modifier, taken from the TGSI register */
	unsigned				abs;
	/* V_SQ_REL_RELATIVE when the TGSI register is indirectly addressed */
	unsigned				rel;
	/* the four dwords of the immediate when sel is V_SQ_ALU_SRC_LITERAL */
	uint32_t				value[4];
};
179
/* Working state shared by the TGSI / byte stream -> r600 bytecode translators. */
struct r600_shader_ctx {
	/* shader scan results (input semantics, interpolation, system values) */
	struct tgsi_shader_info			info;
	/* token parser; holds the declaration/instruction currently processed */
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	/* TGSI_PROCESSOR_* type of the shader being translated */
	unsigned				type;
	/* added to a TGSI register index to obtain the register selector, per file */
	unsigned				file_offset[TGSI_FILE_COUNT];
	/* base register handed out by r600_get_temp() */
	unsigned				temp_reg;
	/* entry of the per-chip opcode table for the current instruction */
	struct r600_shader_tgsi_instruction	*inst_info;
	/* bytecode being generated */
	struct r600_bytecode			*bc;
	/* shader description being filled in (inputs, outputs, flags) */
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];
	/* immediate values, four dwords per TGSI immediate */
	uint32_t				*literals;
	uint32_t				nliterals;
	/* number of registers handed out by r600_get_temp() so far */
	uint32_t				max_driver_temp_used;
	/* needed for evergreen interpolation */
	boolean                                 input_centroid;
	boolean                                 input_linear;
	boolean                                 input_perspective;
	/* result of evergreen_gpr_count() */
	int					num_interp_gpr;
	/* register of the TGSI_SEMANTIC_FACE fragment input */
	int					face_gpr;
	/* number of TGSI_SEMANTIC_COLOR fragment inputs declared */
	int					colors_used;
	/* set when a vertex shader declares a TGSI_SEMANTIC_CLIPVERTEX output */
	boolean                 clip_vertex_write;
	/* index of that clip vertex output in shader->output[] */
	unsigned                cv_output;
	/* index of the TGSI_SEMANTIC_POSITION fragment input in shader->input[] */
	int					fragcoord_input;
	/* when zero, the instance id is converted to float at declaration time */
	int					native_integers;
};
206
/* One entry of a per-chip opcode table; the tables are indexed by TGSI opcode. */
struct r600_shader_tgsi_instruction {
	unsigned	tgsi_opcode;
	/* NOTE(review): presumably non-zero for three-source ALU opcodes - confirm */
	unsigned	is_op3;
	/* NOTE(review): presumably the hardware opcode to emit - confirm */
	unsigned	r600_opcode;
	/* handler invoked with the translation context */
	int (*process)(struct r600_shader_ctx *ctx);
};
213
/* Per-chip opcode tables (r600/r700, evergreen, cayman); defined elsewhere in this file. */
static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
/* Helpers defined later in the file; the flow-control ones are also used by
 * the byte stream decoder below. */
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
static int tgsi_else(struct r600_shader_ctx *ctx);
static int tgsi_endif(struct r600_shader_ctx *ctx);
static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
static int tgsi_endloop(struct r600_shader_ctx *ctx);
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
223
224/*
225 * bytestream -> r600 shader
226 *
227 * These functions are used to transform the output of the LLVM backend into
228 * struct r600_bytecode.
229 */
230
231static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
232				unsigned char * bytes,	unsigned num_bytes);
233
#ifdef HAVE_OPENCL
/*
 * Compile an LLVM module with r600_llvm_compile() and decode the resulting
 * byte stream into *bytecode.
 *
 * Always returns 1.  A failure to build the bytecode is reported through
 * R600_ERR but does not change the return value.
 */
int r600_compute_shader_create(struct pipe_context * ctx,
	LLVMModuleRef mod,  struct r600_bytecode * bytecode)
{
	struct r600_context *r600_ctx = (struct r600_context *)ctx;
	/* initialised so that an untouched output still yields an empty stream
	 * and a harmless free(NULL) */
	unsigned char * bytes = NULL;
	unsigned byte_count = 0;
	struct r600_shader_ctx shader_ctx;
	unsigned dump = 0;

	if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
		dump = 1;
	}

	/* Only bc is set up below; zero the rest so the decoder never reads
	 * stack garbage (shader_ctx.shader is dereferenced when a KILLGT
	 * instruction is decoded). */
	memset(&shader_ctx, 0, sizeof(shader_ctx));

	r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
	shader_ctx.bc = bytecode;
	r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family);
	shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
	r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
	if (shader_ctx.bc->chip_class == CAYMAN) {
		cm_bytecode_add_cf_end(shader_ctx.bc);
	}
	if (r600_bytecode_build(shader_ctx.bc)) {
		R600_ERR("building bytecode failed !\n");
	}
	if (dump) {
		r600_bytecode_dump(shader_ctx.bc);
	}
	free(bytes);
	return 1;
}

#endif /* HAVE_OPENCL */
265
/*
 * Read a 32-bit little-endian value from bytes[] starting at *bytes_read and
 * advance *bytes_read by four.
 *
 * Each byte is widened to uint32_t before shifting: a byte >= 0x80 promoted
 * to int and shifted left by 24 would overflow a signed int, which is
 * undefined behaviour.
 */
static uint32_t i32_from_byte_stream(unsigned char * bytes,
		unsigned * bytes_read)
{
	unsigned i;
	uint32_t out = 0;
	for (i = 0; i < 4; i++) {
		out |= (uint32_t)bytes[(*bytes_read)++] << (8 * i);
	}
	return out;
}
276
277static unsigned r600_src_from_byte_stream(unsigned char * bytes,
278		unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
279{
280	unsigned i;
281	unsigned sel0, sel1;
282	sel0 = bytes[bytes_read++];
283	sel1 = bytes[bytes_read++];
284	alu->src[src_idx].sel = sel0 | (sel1 << 8);
285	alu->src[src_idx].chan = bytes[bytes_read++];
286	alu->src[src_idx].neg = bytes[bytes_read++];
287	alu->src[src_idx].abs = bytes[bytes_read++];
288	alu->src[src_idx].rel = bytes[bytes_read++];
289	alu->src[src_idx].kc_bank = bytes[bytes_read++];
290	for (i = 0; i < 4; i++) {
291		alu->src[src_idx].value |= bytes[bytes_read++] << (i * 8);
292	}
293	return bytes_read;
294}
295
296static unsigned r600_alu_from_byte_stream(struct r600_shader_ctx *ctx,
297				unsigned char * bytes, unsigned bytes_read)
298{
299	unsigned src_idx;
300	struct r600_bytecode_alu alu;
301	unsigned src_const_reg[3];
302	uint32_t word0, word1;
303
304	memset(&alu, 0, sizeof(alu));
305	for(src_idx = 0; src_idx < 3; src_idx++) {
306		unsigned i;
307		src_const_reg[src_idx] = bytes[bytes_read++];
308		for (i = 0; i < 4; i++) {
309			alu.src[src_idx].value |= bytes[bytes_read++] << (i * 8);
310		}
311	}
312
313	word0 = i32_from_byte_stream(bytes, &bytes_read);
314	word1 = i32_from_byte_stream(bytes, &bytes_read);
315
316	switch(ctx->bc->chip_class) {
317	case R600:
318		r600_bytecode_alu_read(&alu, word0, word1);
319		break;
320	case R700:
321	case EVERGREEN:
322	case CAYMAN:
323		r700_bytecode_alu_read(&alu, word0, word1);
324		break;
325	}
326
327	for(src_idx = 0; src_idx < 3; src_idx++) {
328		if (src_const_reg[src_idx])
329			alu.src[src_idx].sel += 512;
330	}
331
332	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE) ||
333	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE) ||
334	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT) ||
335	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT)) {
336		alu.update_pred = 1;
337		alu.dst.write = 0;
338		alu.src[1].sel = V_SQ_ALU_SRC_0;
339		alu.src[1].chan = 0;
340		alu.last = 1;
341	}
342
343	if (alu.execute_mask) {
344		alu.pred_sel = 0;
345		r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
346	} else {
347		r600_bytecode_add_alu(ctx->bc, &alu);
348	}
349
350	/* XXX: Handle other KILL instructions */
351	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT)) {
352		ctx->shader->uses_kill = 1;
353		/* XXX: This should be enforced in the LLVM backend. */
354		ctx->bc->force_add_cf = 1;
355	}
356	return bytes_read;
357}
358
359static void llvm_if(struct r600_shader_ctx *ctx, struct r600_bytecode_alu * alu,
360	unsigned pred_inst)
361{
362	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
363	fc_pushlevel(ctx, FC_IF);
364	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
365}
366
367static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx,
368			struct r600_bytecode_alu *alu, unsigned compare_opcode)
369{
370	unsigned opcode = TGSI_OPCODE_BRK;
371	if (ctx->bc->chip_class == CAYMAN)
372		ctx->inst_info = &cm_shader_tgsi_instruction[opcode];
373	else if (ctx->bc->chip_class >= EVERGREEN)
374		ctx->inst_info = &eg_shader_tgsi_instruction[opcode];
375	else
376		ctx->inst_info = &r600_shader_tgsi_instruction[opcode];
377	llvm_if(ctx, alu, compare_opcode);
378	tgsi_loop_brk_cont(ctx);
379	tgsi_endif(ctx);
380}
381
382static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
383				unsigned char * bytes, unsigned bytes_read)
384{
385	struct r600_bytecode_alu alu;
386	unsigned inst;
387	memset(&alu, 0, sizeof(alu));
388	bytes_read = r600_src_from_byte_stream(bytes, bytes_read, &alu, 0);
389	inst = bytes[bytes_read++];
390	switch (inst) {
391	case 0: /* FC_IF */
392		llvm_if(ctx, &alu,
393			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
394		break;
395	case 1: /* FC_IF_INT */
396		llvm_if(ctx, &alu,
397			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
398		break;
399	case 2: /* FC_ELSE */
400		tgsi_else(ctx);
401		break;
402	case 3: /* FC_ENDIF */
403		tgsi_endif(ctx);
404		break;
405	case 4: /* FC_BGNLOOP */
406		tgsi_bgnloop(ctx);
407		break;
408	case 5: /* FC_ENDLOOP */
409		tgsi_endloop(ctx);
410		break;
411	case 6: /* FC_BREAK */
412		r600_break_from_byte_stream(ctx, &alu,
413			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
414		break;
415	case 7: /* FC_BREAK_NZ_INT */
416		r600_break_from_byte_stream(ctx, &alu,
417			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
418		break;
419	case 8: /* FC_CONTINUE */
420		{
421			unsigned opcode = TGSI_OPCODE_CONT;
422			if (ctx->bc->chip_class == CAYMAN) {
423				ctx->inst_info =
424					&cm_shader_tgsi_instruction[opcode];
425			} else if (ctx->bc->chip_class >= EVERGREEN) {
426				ctx->inst_info =
427					&eg_shader_tgsi_instruction[opcode];
428			} else {
429				ctx->inst_info =
430					&r600_shader_tgsi_instruction[opcode];
431			}
432			tgsi_loop_brk_cont(ctx);
433		}
434		break;
435	case 9: /* FC_BREAK_Z_INT */
436		r600_break_from_byte_stream(ctx, &alu,
437			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
438		break;
439	case 10: /* FC_BREAK_NZ */
440		r600_break_from_byte_stream(ctx, &alu,
441			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
442		break;
443	}
444
445	return bytes_read;
446}
447
448static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
449				unsigned char * bytes, unsigned bytes_read)
450{
451	struct r600_bytecode_tex tex;
452
453	tex.inst = bytes[bytes_read++];
454	tex.resource_id = bytes[bytes_read++];
455	tex.src_gpr = bytes[bytes_read++];
456	tex.src_rel = bytes[bytes_read++];
457	tex.dst_gpr = bytes[bytes_read++];
458	tex.dst_rel = bytes[bytes_read++];
459	tex.dst_sel_x = bytes[bytes_read++];
460	tex.dst_sel_y = bytes[bytes_read++];
461	tex.dst_sel_z = bytes[bytes_read++];
462	tex.dst_sel_w = bytes[bytes_read++];
463	tex.lod_bias = bytes[bytes_read++];
464	tex.coord_type_x = bytes[bytes_read++];
465	tex.coord_type_y = bytes[bytes_read++];
466	tex.coord_type_z = bytes[bytes_read++];
467	tex.coord_type_w = bytes[bytes_read++];
468	tex.offset_x = bytes[bytes_read++];
469	tex.offset_y = bytes[bytes_read++];
470	tex.offset_z = bytes[bytes_read++];
471	tex.sampler_id = bytes[bytes_read++];
472	tex.src_sel_x = bytes[bytes_read++];
473	tex.src_sel_y = bytes[bytes_read++];
474	tex.src_sel_z = bytes[bytes_read++];
475	tex.src_sel_w = bytes[bytes_read++];
476
477	r600_bytecode_add_tex(ctx->bc, &tex);
478
479	return bytes_read;
480}
481
482static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
483	unsigned char * bytes, unsigned bytes_read)
484{
485	struct r600_bytecode_vtx vtx;
486
487	uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
488        uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
489	uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);
490
491	memset(&vtx, 0, sizeof(vtx));
492
493	/* WORD0 */
494	vtx.inst = G_SQ_VTX_WORD0_VTX_INST(word0);
495	vtx.fetch_type = G_SQ_VTX_WORD0_FETCH_TYPE(word0);
496	vtx.buffer_id = G_SQ_VTX_WORD0_BUFFER_ID(word0);
497	vtx.src_gpr = G_SQ_VTX_WORD0_SRC_GPR(word0);
498	vtx.src_sel_x = G_SQ_VTX_WORD0_SRC_SEL_X(word0);
499	vtx.mega_fetch_count = G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(word0);
500
501	/* WORD1 */
502	vtx.dst_gpr = G_SQ_VTX_WORD1_GPR_DST_GPR(word1);
503	vtx.dst_sel_x = G_SQ_VTX_WORD1_DST_SEL_X(word1);
504	vtx.dst_sel_y = G_SQ_VTX_WORD1_DST_SEL_Y(word1);
505	vtx.dst_sel_z = G_SQ_VTX_WORD1_DST_SEL_Z(word1);
506	vtx.dst_sel_w = G_SQ_VTX_WORD1_DST_SEL_W(word1);
507	vtx.use_const_fields = G_SQ_VTX_WORD1_USE_CONST_FIELDS(word1);
508	vtx.data_format = G_SQ_VTX_WORD1_DATA_FORMAT(word1);
509	vtx.num_format_all = G_SQ_VTX_WORD1_NUM_FORMAT_ALL(word1);
510	vtx.format_comp_all = G_SQ_VTX_WORD1_FORMAT_COMP_ALL(word1);
511	vtx.srf_mode_all = G_SQ_VTX_WORD1_SRF_MODE_ALL(word1);
512
513	/* WORD 2*/
514	vtx.offset = G_SQ_VTX_WORD2_OFFSET(word2);
515	vtx.endian = G_SQ_VTX_WORD2_ENDIAN_SWAP(word2);
516
517	if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
518		fprintf(stderr, "Error adding vtx\n");
519	}
520	/* Use the Texture Cache */
521	ctx->bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX;
522	return bytes_read;
523}
524
525static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
526				unsigned char * bytes,	unsigned num_bytes)
527{
528	unsigned bytes_read = 0;
529	unsigned i, byte;
530	while (bytes_read < num_bytes) {
531		char inst_type = bytes[bytes_read++];
532		switch (inst_type) {
533		case 0:
534			bytes_read = r600_alu_from_byte_stream(ctx, bytes,
535								bytes_read);
536			break;
537		case 1:
538			bytes_read = r600_tex_from_byte_stream(ctx, bytes,
539								bytes_read);
540			break;
541		case 2:
542			bytes_read = r600_fc_from_byte_stream(ctx, bytes,
543								bytes_read);
544			break;
545		case 3:
546			r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
547			for (i = 0; i < 2; i++) {
548				for (byte = 0 ; byte < 4; byte++) {
549					ctx->bc->cf_last->isa[i] |=
550					(bytes[bytes_read++] << (byte * 8));
551				}
552			}
553			break;
554
555		case 4:
556			bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
557								bytes_read);
558			break;
559		default:
560			/* XXX: Error here */
561			break;
562		}
563	}
564}
565
566/* End bytestream -> r600 shader functions*/
567
568static int tgsi_is_supported(struct r600_shader_ctx *ctx)
569{
570	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
571	int j;
572
573	if (i->Instruction.NumDstRegs > 1) {
574		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
575		return -EINVAL;
576	}
577	if (i->Instruction.Predicate) {
578		R600_ERR("predicate unsupported\n");
579		return -EINVAL;
580	}
581#if 0
582	if (i->Instruction.Label) {
583		R600_ERR("label unsupported\n");
584		return -EINVAL;
585	}
586#endif
587	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
588		if (i->Src[j].Register.Dimension) {
589			R600_ERR("unsupported src %d (dimension %d)\n", j,
590				 i->Src[j].Register.Dimension);
591			return -EINVAL;
592		}
593	}
594	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
595		if (i->Dst[j].Register.Dimension) {
596			R600_ERR("unsupported dst (dimension)\n");
597			return -EINVAL;
598		}
599	}
600	return 0;
601}
602
603static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
604{
605	int i, r;
606	struct r600_bytecode_alu alu;
607	int gpr = 0, base_chan = 0;
608	int ij_index = 0;
609
610	if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
611		ij_index = 0;
612		if (ctx->shader->input[input].centroid)
613			ij_index++;
614	} else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
615		ij_index = 0;
616		/* if we have perspective add one */
617		if (ctx->input_perspective)  {
618			ij_index++;
619			/* if we have perspective centroid */
620			if (ctx->input_centroid)
621				ij_index++;
622		}
623		if (ctx->shader->input[input].centroid)
624			ij_index++;
625	}
626
627	/* work out gpr and base_chan from index */
628	gpr = ij_index / 2;
629	base_chan = (2 * (ij_index % 2)) + 1;
630
631	for (i = 0; i < 8; i++) {
632		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
633
634		if (i < 4)
635			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_ZW;
636		else
637			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_XY;
638
639		if ((i > 1) && (i < 6)) {
640			alu.dst.sel = ctx->shader->input[input].gpr;
641			alu.dst.write = 1;
642		}
643
644		alu.dst.chan = i % 4;
645
646		alu.src[0].sel = gpr;
647		alu.src[0].chan = (base_chan - (i % 2));
648
649		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
650
651		alu.bank_swizzle_force = SQ_ALU_VEC_210;
652		if ((i % 4) == 3)
653			alu.last = 1;
654		r = r600_bytecode_add_alu(ctx->bc, &alu);
655		if (r)
656			return r;
657	}
658	return 0;
659}
660
661static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
662{
663	int i, r;
664	struct r600_bytecode_alu alu;
665
666	for (i = 0; i < 4; i++) {
667		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
668
669		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P0;
670
671		alu.dst.sel = ctx->shader->input[input].gpr;
672		alu.dst.write = 1;
673
674		alu.dst.chan = i;
675
676		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
677		alu.src[0].chan = i;
678
679		if (i == 3)
680			alu.last = 1;
681		r = r600_bytecode_add_alu(ctx->bc, &alu);
682		if (r)
683			return r;
684	}
685	return 0;
686}
687
688/*
689 * Special export handling in shaders
690 *
691 * shader export ARRAY_BASE for EXPORT_POS:
692 * 60 is position
693 * 61 is misc vector
694 * 62, 63 are clip distance vectors
695 *
696 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
697 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
698 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
699 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
700 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
701 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
702 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
703 * exclusive from render target index)
704 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
705 *
706 *
707 * shader export ARRAY_BASE for EXPORT_PIXEL:
708 * 0-7 CB targets
709 * 61 computed Z vector
710 *
711 * The use of the values exported in the computed Z vector are controlled
712 * by DB_SHADER_CONTROL:
713 * Z_EXPORT_ENABLE - Z as a float in RED
714 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
715 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
716 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
717 * DB_SOURCE_FORMAT - export control restrictions
718 *
719 */
720
721
722/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
723static int r600_spi_sid(struct r600_shader_io * io)
724{
725	int index, name = io->name;
726
727	/* These params are handled differently, they don't need
728	 * semantic indices, so we'll use 0 for them.
729	 */
730	if (name == TGSI_SEMANTIC_POSITION ||
731		name == TGSI_SEMANTIC_PSIZE ||
732		name == TGSI_SEMANTIC_FACE)
733		index = 0;
734	else {
735		if (name == TGSI_SEMANTIC_GENERIC) {
736			/* For generic params simply use sid from tgsi */
737			index = io->sid;
738		} else {
739			/* For non-generic params - pack name and sid into 8 bits */
740			index = 0x80 | (name<<3) | (io->sid);
741		}
742
743		/* Make sure that all really used indices have nonzero value, so
744		 * we can just compare it to 0 later instead of comparing the name
745		 * with different values to detect special cases. */
746		index++;
747	}
748
749	return index;
750};
751
752/* turn input into interpolate on EG */
753static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
754{
755	int r = 0;
756
757	if (ctx->shader->input[index].spi_sid) {
758		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
759		if (ctx->shader->input[index].interpolate > 0) {
760			r = evergreen_interp_alu(ctx, index);
761		} else {
762			r = evergreen_interp_flat(ctx, index);
763		}
764	}
765	return r;
766}
767
768static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
769{
770	struct r600_bytecode_alu alu;
771	int i, r;
772	int gpr_front = ctx->shader->input[front].gpr;
773	int gpr_back = ctx->shader->input[back].gpr;
774
775	for (i = 0; i < 4; i++) {
776		memset(&alu, 0, sizeof(alu));
777		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
778		alu.is_op3 = 1;
779		alu.dst.write = 1;
780		alu.dst.sel = gpr_front;
781		alu.src[0].sel = ctx->face_gpr;
782		alu.src[1].sel = gpr_front;
783		alu.src[2].sel = gpr_back;
784
785		alu.dst.chan = i;
786		alu.src[1].chan = i;
787		alu.src[2].chan = i;
788		alu.last = (i==3);
789
790		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
791			return r;
792	}
793
794	return 0;
795}
796
/*
 * Process the TGSI declaration currently held by the parser.
 *
 * Input and output declarations append an entry to shader->input[] /
 * shader->output[]; constant, temporary, sampler and address declarations
 * need no work; of the system values only the instance id and the vertex id
 * are accepted.
 *
 * Returns 0 on success, -EINVAL for an unsupported declaration, or the error
 * of a helper that emits instructions.
 */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	unsigned i;
	int r;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		i = ctx->shader->ninput++;
		ctx->shader->input[i].name = d->Semantic.Name;
		ctx->shader->input[i].sid = d->Semantic.Index;
		/* must come after name and sid are set: r600_spi_sid() reads both */
		ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
		ctx->shader->input[i].centroid = d->Interp.Centroid;
		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			/* remember the special fragment inputs */
			switch (ctx->shader->input[i].name) {
			case TGSI_SEMANTIC_FACE:
				ctx->face_gpr = ctx->shader->input[i].gpr;
				break;
			case TGSI_SEMANTIC_COLOR:
				ctx->colors_used++;
				break;
			case TGSI_SEMANTIC_POSITION:
				ctx->fragcoord_input = i;
				break;
			}
			/* on evergreen and later, inputs are loaded by shader code */
			if (ctx->bc->chip_class >= EVERGREEN) {
				if ((r = evergreen_interp_input(ctx, i)))
					return r;
			}
		}
		break;
	case TGSI_FILE_OUTPUT:
		i = ctx->shader->noutput++;
		ctx->shader->output[i].name = d->Semantic.Name;
		ctx->shader->output[i].sid = d->Semantic.Index;
		/* must come after name and sid are set: r600_spi_sid() reads both */
		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_CLIPDIST:
				/* four mask bits per clip distance vector */
				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
				break;
			case TGSI_SEMANTIC_PSIZE:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_point_size = 1;
				break;
			case TGSI_SEMANTIC_CLIPVERTEX:
				ctx->clip_vertex_write = TRUE;
				ctx->cv_output = i;
				break;
			}
		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_COLOR:
				ctx->shader->nr_ps_max_color_exports++;
				break;
			}
		}
		break;
	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_TEMPORARY:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_ADDRESS:
		/* nothing to record for these files */
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			if (!ctx->native_integers) {
				/* without native integers, convert register 0
				 * channel 3 from int to float in place */
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		/* any other system value falls through to the error below */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
893
894static int r600_get_temp(struct r600_shader_ctx *ctx)
895{
896	return ctx->temp_reg + ctx->max_driver_temp_used++;
897}
898
899/*
900 * for evergreen we need to scan the shader to find the number of GPRs we need to
901 * reserve for interpolation.
902 *
903 * we need to know if we are going to emit
904 * any centroid inputs
905 * if perspective and linear are required
906*/
907static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
908{
909	int i;
910	int num_baryc;
911
912	ctx->input_linear = FALSE;
913	ctx->input_perspective = FALSE;
914	ctx->input_centroid = FALSE;
915	ctx->num_interp_gpr = 1;
916
917	/* any centroid inputs */
918	for (i = 0; i < ctx->info.num_inputs; i++) {
919		/* skip position/face */
920		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
921		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
922			continue;
923		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
924			ctx->input_linear = TRUE;
925		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
926			ctx->input_perspective = TRUE;
927		if (ctx->info.input_centroid[i])
928			ctx->input_centroid = TRUE;
929	}
930
931	num_baryc = 0;
932	/* ignoring sample for now */
933	if (ctx->input_perspective)
934		num_baryc++;
935	if (ctx->input_linear)
936		num_baryc++;
937	if (ctx->input_centroid)
938		num_baryc *= 2;
939
940	ctx->num_interp_gpr += (num_baryc + 1) >> 1;
941
942	/* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */
943	return ctx->num_interp_gpr;
944}
945
946static void tgsi_src(struct r600_shader_ctx *ctx,
947		     const struct tgsi_full_src_register *tgsi_src,
948		     struct r600_shader_src *r600_src)
949{
950	memset(r600_src, 0, sizeof(*r600_src));
951	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
952	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
953	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
954	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
955	r600_src->neg = tgsi_src->Register.Negate;
956	r600_src->abs = tgsi_src->Register.Absolute;
957
958	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
959		int index;
960		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
961			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
962			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
963
964			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
965			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
966			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
967				return;
968		}
969		index = tgsi_src->Register.Index;
970		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
971		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
972	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
973		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
974			r600_src->swizzle[0] = 3;
975			r600_src->swizzle[1] = 3;
976			r600_src->swizzle[2] = 3;
977			r600_src->swizzle[3] = 3;
978			r600_src->sel = 0;
979		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
980			r600_src->swizzle[0] = 0;
981			r600_src->swizzle[1] = 0;
982			r600_src->swizzle[2] = 0;
983			r600_src->swizzle[3] = 0;
984			r600_src->sel = 0;
985		}
986	} else {
987		if (tgsi_src->Register.Indirect)
988			r600_src->rel = V_SQ_REL_RELATIVE;
989		r600_src->sel = tgsi_src->Register.Index;
990		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
991	}
992}
993
994static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int offset, unsigned int dst_reg)
995{
996	struct r600_bytecode_vtx vtx;
997	unsigned int ar_reg;
998	int r;
999
1000	if (offset) {
1001		struct r600_bytecode_alu alu;
1002
1003		memset(&alu, 0, sizeof(alu));
1004
1005		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
1006		alu.src[0].sel = ctx->bc->ar_reg;
1007
1008		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1009		alu.src[1].value = offset;
1010
1011		alu.dst.sel = dst_reg;
1012		alu.dst.write = 1;
1013		alu.last = 1;
1014
1015		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1016			return r;
1017
1018		ar_reg = dst_reg;
1019	} else {
1020		ar_reg = ctx->bc->ar_reg;
1021	}
1022
1023	memset(&vtx, 0, sizeof(vtx));
1024	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
1025	vtx.src_gpr = ar_reg;
1026	vtx.mega_fetch_count = 16;
1027	vtx.dst_gpr = dst_reg;
1028	vtx.dst_sel_x = 0;		/* SEL_X */
1029	vtx.dst_sel_y = 1;		/* SEL_Y */
1030	vtx.dst_sel_z = 2;		/* SEL_Z */
1031	vtx.dst_sel_w = 3;		/* SEL_W */
1032	vtx.data_format = FMT_32_32_32_32_FLOAT;
1033	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1034	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1035	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
1036	vtx.endian = r600_endian_swap(32);
1037
1038	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1039		return r;
1040
1041	return 0;
1042}
1043
1044static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1045{
1046	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1047	struct r600_bytecode_alu alu;
1048	int i, j, k, nconst, r;
1049
1050	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1051		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1052			nconst++;
1053		}
1054		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1055	}
1056	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1057		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1058			continue;
1059		}
1060
1061		if (ctx->src[i].rel) {
1062			int treg = r600_get_temp(ctx);
1063			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].sel - 512, treg)))
1064				return r;
1065
1066			ctx->src[i].sel = treg;
1067			ctx->src[i].rel = 0;
1068			j--;
1069		} else if (j > 0) {
1070			int treg = r600_get_temp(ctx);
1071			for (k = 0; k < 4; k++) {
1072				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1073				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1074				alu.src[0].sel = ctx->src[i].sel;
1075				alu.src[0].chan = k;
1076				alu.src[0].rel = ctx->src[i].rel;
1077				alu.dst.sel = treg;
1078				alu.dst.chan = k;
1079				alu.dst.write = 1;
1080				if (k == 3)
1081					alu.last = 1;
1082				r = r600_bytecode_add_alu(ctx->bc, &alu);
1083				if (r)
1084					return r;
1085			}
1086			ctx->src[i].sel = treg;
1087			ctx->src[i].rel =0;
1088			j--;
1089		}
1090	}
1091	return 0;
1092}
1093
1094/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1095static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1096{
1097	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1098	struct r600_bytecode_alu alu;
1099	int i, j, k, nliteral, r;
1100
1101	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1102		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1103			nliteral++;
1104		}
1105	}
1106	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1107		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1108			int treg = r600_get_temp(ctx);
1109			for (k = 0; k < 4; k++) {
1110				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1111				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1112				alu.src[0].sel = ctx->src[i].sel;
1113				alu.src[0].chan = k;
1114				alu.src[0].value = ctx->src[i].value[k];
1115				alu.dst.sel = treg;
1116				alu.dst.chan = k;
1117				alu.dst.write = 1;
1118				if (k == 3)
1119					alu.last = 1;
1120				r = r600_bytecode_add_alu(ctx->bc, &alu);
1121				if (r)
1122					return r;
1123			}
1124			ctx->src[i].sel = treg;
1125			j--;
1126		}
1127	}
1128	return 0;
1129}
1130
1131static int process_twoside_color_inputs(struct r600_shader_ctx *ctx, unsigned use_llvm)
1132{
1133	int i, r, count = ctx->shader->ninput;
1134
1135	for (i = 0; i < count; i++) {
1136		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1137			unsigned back_facing_reg = ctx->shader->input[i].potential_back_facing_reg;
1138			if (ctx->bc->chip_class >= EVERGREEN) {
1139				if ((r = evergreen_interp_input(ctx, back_facing_reg)))
1140					return r;
1141			}
1142
1143			if (!use_llvm) {
1144				r = select_twoside_color(ctx, i, back_facing_reg);
1145				if (r)
1146					return r;
1147			}
1148		}
1149	}
1150	return 0;
1151}
1152
1153static int r600_shader_from_tgsi(struct r600_screen *rscreen,
1154				 struct r600_pipe_shader *pipeshader,
1155				 struct r600_shader_key key)
1156{
1157	struct r600_shader *shader = &pipeshader->shader;
1158	struct tgsi_token *tokens = pipeshader->selector->tokens;
1159	struct pipe_stream_output_info so = pipeshader->selector->so;
1160	struct tgsi_full_immediate *immediate;
1161	struct tgsi_full_property *property;
1162	struct r600_shader_ctx ctx;
1163	struct r600_bytecode_output output[32];
1164	unsigned output_done, noutput;
1165	unsigned opcode;
1166	int i, j, k, r = 0;
1167	int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0;
1168	/* Declarations used by llvm code */
1169	bool use_llvm = false;
1170	unsigned char * inst_bytes = NULL;
1171	unsigned inst_byte_count = 0;
1172
1173#ifdef R600_USE_LLVM
1174	use_llvm = debug_get_bool_option("R600_LLVM", TRUE);
1175#endif
1176	ctx.bc = &shader->bc;
1177	ctx.shader = shader;
1178	ctx.native_integers = true;
1179
1180	r600_bytecode_init(ctx.bc, rscreen->chip_class, rscreen->family);
1181	ctx.tokens = tokens;
1182	tgsi_scan_shader(tokens, &ctx.info);
1183	tgsi_parse_init(&ctx.parse, tokens);
1184	ctx.type = ctx.parse.FullHeader.Processor.Processor;
1185	shader->processor_type = ctx.type;
1186	ctx.bc->type = shader->processor_type;
1187
1188	ctx.face_gpr = -1;
1189	ctx.fragcoord_input = -1;
1190	ctx.colors_used = 0;
1191	ctx.clip_vertex_write = 0;
1192
1193	shader->nr_ps_color_exports = 0;
1194	shader->nr_ps_max_color_exports = 0;
1195
1196	shader->two_side = key.color_two_side;
1197
1198	/* register allocations */
1199	/* Values [0,127] correspond to GPR[0..127].
1200	 * Values [128,159] correspond to constant buffer bank 0
1201	 * Values [160,191] correspond to constant buffer bank 1
1202	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1203	 * Values [256,287] correspond to constant buffer bank 2 (EG)
1204	 * Values [288,319] correspond to constant buffer bank 3 (EG)
1205	 * Other special values are shown in the list below.
1206	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1207	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1208	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1209	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1210	 * 248	SQ_ALU_SRC_0: special constant 0.0.
1211	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
1212	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
1213	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1214	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
1215	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
1216	 * 254	SQ_ALU_SRC_PV: previous vector result.
1217	 * 255	SQ_ALU_SRC_PS: previous scalar result.
1218	 */
1219	for (i = 0; i < TGSI_FILE_COUNT; i++) {
1220		ctx.file_offset[i] = 0;
1221	}
1222	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1223		ctx.file_offset[TGSI_FILE_INPUT] = 1;
1224		if (ctx.bc->chip_class >= EVERGREEN) {
1225			r600_bytecode_add_cfinst(ctx.bc, EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1226		} else {
1227			r600_bytecode_add_cfinst(ctx.bc, V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1228		}
1229	}
1230	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
1231		ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1232	}
1233
1234#ifdef R600_USE_LLVM
1235	if (use_llvm && ctx.info.indirect_files) {
1236		fprintf(stderr, "Warning: R600 LLVM backend does not support "
1237				"indirect adressing.  Falling back to TGSI "
1238				"backend.\n");
1239		use_llvm = 0;
1240	}
1241#endif
1242
1243	if (use_llvm) {
1244		ctx.file_offset[TGSI_FILE_OUTPUT] =
1245			ctx.file_offset[TGSI_FILE_INPUT];
1246	} else {
1247	   ctx.file_offset[TGSI_FILE_OUTPUT] =
1248			ctx.file_offset[TGSI_FILE_INPUT] +
1249			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1250	}
1251	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
1252						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
1253
1254	/* Outside the GPR range. This will be translated to one of the
1255	 * kcache banks later. */
1256	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
1257
1258	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
1259	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
1260			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
1261	ctx.temp_reg = ctx.bc->ar_reg + 1;
1262
1263	ctx.nliterals = 0;
1264	ctx.literals = NULL;
1265	shader->fs_write_all = FALSE;
1266	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1267		tgsi_parse_token(&ctx.parse);
1268		switch (ctx.parse.FullToken.Token.Type) {
1269		case TGSI_TOKEN_TYPE_IMMEDIATE:
1270			immediate = &ctx.parse.FullToken.FullImmediate;
1271			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
1272			if(ctx.literals == NULL) {
1273				r = -ENOMEM;
1274				goto out_err;
1275			}
1276			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
1277			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
1278			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
1279			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
1280			ctx.nliterals++;
1281			break;
1282		case TGSI_TOKEN_TYPE_DECLARATION:
1283			r = tgsi_declaration(&ctx);
1284			if (r)
1285				goto out_err;
1286			break;
1287		case TGSI_TOKEN_TYPE_INSTRUCTION:
1288			break;
1289		case TGSI_TOKEN_TYPE_PROPERTY:
1290			property = &ctx.parse.FullToken.FullProperty;
1291			switch (property->Property.PropertyName) {
1292			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
1293				if (property->u[0].Data == 1)
1294					shader->fs_write_all = TRUE;
1295				break;
1296			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
1297				/* we don't need this one */
1298				break;
1299			}
1300			break;
1301		default:
1302			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
1303			r = -EINVAL;
1304			goto out_err;
1305		}
1306	}
1307
1308	/* Process two side if needed */
1309	if (shader->two_side && ctx.colors_used) {
1310		int i, count = ctx.shader->ninput;
1311
1312		/* additional inputs will be allocated right after the existing inputs,
1313		 * we won't need them after the color selection, so we don't need to
1314		 * reserve these gprs for the rest of the shader code and to adjust
1315		 * output offsets etc. */
1316		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
1317				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1318
1319		if (ctx.face_gpr == -1) {
1320			i = ctx.shader->ninput++;
1321			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
1322			ctx.shader->input[i].spi_sid = 0;
1323			ctx.shader->input[i].gpr = gpr++;
1324			ctx.face_gpr = ctx.shader->input[i].gpr;
1325		}
1326
1327		for (i = 0; i < count; i++) {
1328			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1329				int ni = ctx.shader->ninput++;
1330				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
1331				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
1332				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
1333				ctx.shader->input[ni].gpr = gpr++;
1334				ctx.shader->input[i].potential_back_facing_reg = ni;
1335			}
1336		}
1337	}
1338
1339/* LLVM backend setup */
1340#ifdef R600_USE_LLVM
1341	if (use_llvm) {
1342		struct radeon_llvm_context radeon_llvm_ctx;
1343		LLVMModuleRef mod;
1344		unsigned dump = 0;
1345		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
1346		radeon_llvm_ctx.reserved_reg_count = ctx.file_offset[TGSI_FILE_INPUT];
1347		radeon_llvm_ctx.type = ctx.type;
1348		radeon_llvm_ctx.two_side = shader->two_side;
1349		radeon_llvm_ctx.face_input = ctx.face_gpr;
1350		radeon_llvm_ctx.r600_inputs = ctx.shader->input;
1351		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
1352		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
1353		if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
1354			dump = 1;
1355		}
1356		if (r600_llvm_compile(mod, &inst_bytes, &inst_byte_count,
1357							rscreen->family, dump)) {
1358			FREE(inst_bytes);
1359			radeon_llvm_dispose(&radeon_llvm_ctx);
1360			use_llvm = 0;
1361			fprintf(stderr, "R600 LLVM backend failed to compile "
1362				"shader.  Falling back to TGSI\n");
1363		} else {
1364			ctx.file_offset[TGSI_FILE_OUTPUT] =
1365					ctx.file_offset[TGSI_FILE_INPUT];
1366		}
1367		radeon_llvm_dispose(&radeon_llvm_ctx);
1368	}
1369#endif
1370/* End of LLVM backend setup */
1371
1372	if (shader->fs_write_all && rscreen->chip_class >= EVERGREEN)
1373		shader->nr_ps_max_color_exports = 8;
1374
1375	if (ctx.fragcoord_input >= 0) {
1376		if (ctx.bc->chip_class == CAYMAN) {
1377			for (j = 0 ; j < 4; j++) {
1378				struct r600_bytecode_alu alu;
1379				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1380				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1381				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1382				alu.src[0].chan = 3;
1383
1384				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1385				alu.dst.chan = j;
1386				alu.dst.write = (j == 3);
1387				alu.last = 1;
1388				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1389					return r;
1390			}
1391		} else {
1392			struct r600_bytecode_alu alu;
1393			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1394			alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1395			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1396			alu.src[0].chan = 3;
1397
1398			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1399			alu.dst.chan = 3;
1400			alu.dst.write = 1;
1401			alu.last = 1;
1402			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1403				return r;
1404		}
1405	}
1406
1407	if (shader->two_side && ctx.colors_used) {
1408		if ((r = process_twoside_color_inputs(&ctx, use_llvm)))
1409			return r;
1410	}
1411
1412	tgsi_parse_init(&ctx.parse, tokens);
1413	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1414		tgsi_parse_token(&ctx.parse);
1415		switch (ctx.parse.FullToken.Token.Type) {
1416		case TGSI_TOKEN_TYPE_INSTRUCTION:
1417			if (use_llvm) {
1418				continue;
1419			}
1420			r = tgsi_is_supported(&ctx);
1421			if (r)
1422				goto out_err;
1423			ctx.max_driver_temp_used = 0;
1424			/* reserve first tmp for everyone */
1425			r600_get_temp(&ctx);
1426
1427			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
1428			if ((r = tgsi_split_constant(&ctx)))
1429				goto out_err;
1430			if ((r = tgsi_split_literal_constant(&ctx)))
1431				goto out_err;
1432			if (ctx.bc->chip_class == CAYMAN)
1433				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
1434			else if (ctx.bc->chip_class >= EVERGREEN)
1435				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
1436			else
1437				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
1438			r = ctx.inst_info->process(&ctx);
1439			if (r)
1440				goto out_err;
1441			break;
1442		default:
1443			break;
1444		}
1445	}
1446
1447	/* Get instructions if we are using the LLVM backend. */
1448	if (use_llvm) {
1449		r600_bytecode_from_byte_stream(&ctx, inst_bytes, inst_byte_count);
1450		FREE(inst_bytes);
1451	}
1452
1453	noutput = shader->noutput;
1454
1455	if (ctx.clip_vertex_write) {
1456		/* need to convert a clipvertex write into clipdistance writes and not export
1457		   the clip vertex anymore */
1458
1459		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
1460		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1461		shader->output[noutput].gpr = ctx.temp_reg;
1462		noutput++;
1463		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1464		shader->output[noutput].gpr = ctx.temp_reg+1;
1465		noutput++;
1466
1467		/* reset spi_sid for clipvertex output to avoid confusing spi */
1468		shader->output[ctx.cv_output].spi_sid = 0;
1469
1470		shader->clip_dist_write = 0xFF;
1471
1472		for (i = 0; i < 8; i++) {
1473			int oreg = i >> 2;
1474			int ochan = i & 3;
1475
1476			for (j = 0; j < 4; j++) {
1477				struct r600_bytecode_alu alu;
1478				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1479				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4);
1480				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
1481				alu.src[0].chan = j;
1482
1483				alu.src[1].sel = 512 + i;
1484				alu.src[1].kc_bank = 1;
1485				alu.src[1].chan = j;
1486
1487				alu.dst.sel = ctx.temp_reg + oreg;
1488				alu.dst.chan = j;
1489				alu.dst.write = (j == ochan);
1490				if (j == 3)
1491					alu.last = 1;
1492				r = r600_bytecode_add_alu(ctx.bc, &alu);
1493				if (r)
1494					return r;
1495			}
1496		}
1497	}
1498
1499	/* Add stream outputs. */
1500	if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
1501		for (i = 0; i < so.num_outputs; i++) {
1502			struct r600_bytecode_output output;
1503
1504			if (so.output[i].output_buffer >= 4) {
1505				R600_ERR("exceeded the max number of stream output buffers, got: %d\n",
1506					 so.output[i].output_buffer);
1507				r = -EINVAL;
1508				goto out_err;
1509			}
1510			if (so.output[i].dst_offset < so.output[i].start_component) {
1511			   R600_ERR("stream_output - dst_offset cannot be less than start_component\n");
1512			   r = -EINVAL;
1513			   goto out_err;
1514			}
1515
1516			memset(&output, 0, sizeof(struct r600_bytecode_output));
1517			output.gpr = shader->output[so.output[i].register_index].gpr;
1518			output.elem_size = 0;
1519			output.array_base = so.output[i].dst_offset - so.output[i].start_component;
1520			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1521			output.burst_count = 1;
1522			output.barrier = 1;
1523			/* array_size is an upper limit for the burst_count
1524			 * with MEM_STREAM instructions */
1525			output.array_size = 0xFFF;
1526			output.comp_mask = ((1 << so.output[i].num_components) - 1) << so.output[i].start_component;
1527			if (ctx.bc->chip_class >= EVERGREEN) {
1528				switch (so.output[i].output_buffer) {
1529				case 0:
1530					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0;
1531					break;
1532				case 1:
1533					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1;
1534					break;
1535				case 2:
1536					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2;
1537					break;
1538				case 3:
1539					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3;
1540					break;
1541				}
1542			} else {
1543				switch (so.output[i].output_buffer) {
1544				case 0:
1545					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0;
1546					break;
1547				case 1:
1548					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1;
1549					break;
1550				case 2:
1551					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2;
1552					break;
1553				case 3:
1554					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3;
1555					break;
1556				}
1557			}
1558			r = r600_bytecode_add_output(ctx.bc, &output);
1559			if (r)
1560				goto out_err;
1561		}
1562	}
1563
1564	/* export output */
1565	for (i = 0, j = 0; i < noutput; i++, j++) {
1566		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1567		output[j].gpr = shader->output[i].gpr;
1568		output[j].elem_size = 3;
1569		output[j].swizzle_x = 0;
1570		output[j].swizzle_y = 1;
1571		output[j].swizzle_z = 2;
1572		output[j].swizzle_w = 3;
1573		output[j].burst_count = 1;
1574		output[j].barrier = 1;
1575		output[j].type = -1;
1576		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1577		switch (ctx.type) {
1578		case TGSI_PROCESSOR_VERTEX:
1579			switch (shader->output[i].name) {
1580			case TGSI_SEMANTIC_POSITION:
1581				output[j].array_base = next_pos_base++;
1582				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1583				break;
1584
1585			case TGSI_SEMANTIC_PSIZE:
1586				output[j].array_base = next_pos_base++;
1587				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1588				break;
1589			case TGSI_SEMANTIC_CLIPVERTEX:
1590				j--;
1591				break;
1592			case TGSI_SEMANTIC_CLIPDIST:
1593				output[j].array_base = next_pos_base++;
1594				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1595				/* spi_sid is 0 for clipdistance outputs that were generated
1596				 * for clipvertex - we don't need to pass them to PS */
1597				if (shader->output[i].spi_sid) {
1598					j++;
1599					/* duplicate it as PARAM to pass to the pixel shader */
1600					memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
1601					output[j].array_base = next_param_base++;
1602					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1603				}
1604				break;
1605			case TGSI_SEMANTIC_FOG:
1606				output[j].swizzle_y = 4; /* 0 */
1607				output[j].swizzle_z = 4; /* 0 */
1608				output[j].swizzle_w = 5; /* 1 */
1609				break;
1610			}
1611			break;
1612		case TGSI_PROCESSOR_FRAGMENT:
1613			if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
1614				/* never export more colors than the number of CBs */
1615				if (next_pixel_base && next_pixel_base >= key.nr_cbufs) {
1616					/* skip export */
1617					j--;
1618					continue;
1619				}
1620				output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
1621				output[j].array_base = next_pixel_base++;
1622				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1623				shader->nr_ps_color_exports++;
1624				if (shader->fs_write_all && (rscreen->chip_class >= EVERGREEN)) {
1625					for (k = 1; k < key.nr_cbufs; k++) {
1626						j++;
1627						memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1628						output[j].gpr = shader->output[i].gpr;
1629						output[j].elem_size = 3;
1630						output[j].swizzle_x = 0;
1631						output[j].swizzle_y = 1;
1632						output[j].swizzle_z = 2;
1633						output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
1634						output[j].burst_count = 1;
1635						output[j].barrier = 1;
1636						output[j].array_base = next_pixel_base++;
1637						output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1638						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1639						shader->nr_ps_color_exports++;
1640					}
1641				}
1642			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
1643				output[j].array_base = 61;
1644				output[j].swizzle_x = 2;
1645				output[j].swizzle_y = 7;
1646				output[j].swizzle_z = output[j].swizzle_w = 7;
1647				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1648			} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
1649				output[j].array_base = 61;
1650				output[j].swizzle_x = 7;
1651				output[j].swizzle_y = 1;
1652				output[j].swizzle_z = output[j].swizzle_w = 7;
1653				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1654			} else {
1655				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
1656				r = -EINVAL;
1657				goto out_err;
1658			}
1659			break;
1660		default:
1661			R600_ERR("unsupported processor type %d\n", ctx.type);
1662			r = -EINVAL;
1663			goto out_err;
1664		}
1665
1666		if (output[j].type==-1) {
1667			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1668			output[j].array_base = next_param_base++;
1669		}
1670	}
1671
1672	/* add fake param output for vertex shader if no param is exported */
1673	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
1674			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1675			output[j].gpr = 0;
1676			output[j].elem_size = 3;
1677			output[j].swizzle_x = 7;
1678			output[j].swizzle_y = 7;
1679			output[j].swizzle_z = 7;
1680			output[j].swizzle_w = 7;
1681			output[j].burst_count = 1;
1682			output[j].barrier = 1;
1683			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1684			output[j].array_base = 0;
1685			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1686			j++;
1687	}
1688
1689	/* add fake pixel export */
1690	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && next_pixel_base == 0) {
1691		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1692		output[j].gpr = 0;
1693		output[j].elem_size = 3;
1694		output[j].swizzle_x = 7;
1695		output[j].swizzle_y = 7;
1696		output[j].swizzle_z = 7;
1697		output[j].swizzle_w = 7;
1698		output[j].burst_count = 1;
1699		output[j].barrier = 1;
1700		output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1701		output[j].array_base = 0;
1702		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1703		j++;
1704	}
1705
1706	noutput = j;
1707
1708	/* set export done on last export of each type */
1709	for (i = noutput - 1, output_done = 0; i >= 0; i--) {
1710		if (ctx.bc->chip_class < CAYMAN) {
1711			if (i == (noutput - 1)) {
1712				output[i].end_of_program = 1;
1713			}
1714		}
1715		if (!(output_done & (1 << output[i].type))) {
1716			output_done |= (1 << output[i].type);
1717			output[i].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE);
1718		}
1719	}
1720	/* add output to bytecode */
1721	for (i = 0; i < noutput; i++) {
1722		r = r600_bytecode_add_output(ctx.bc, &output[i]);
1723		if (r)
1724			goto out_err;
1725	}
1726	/* add program end */
1727	if (ctx.bc->chip_class == CAYMAN)
1728		cm_bytecode_add_cf_end(ctx.bc);
1729
1730	/* check GPR limit - we have 124 = 128 - 4
1731	 * (4 are reserved as alu clause temporary registers) */
1732	if (ctx.bc->ngpr > 124) {
1733		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
1734		r = -ENOMEM;
1735		goto out_err;
1736	}
1737
1738	free(ctx.literals);
1739	tgsi_parse_free(&ctx.parse);
1740	return 0;
1741out_err:
1742	free(ctx.literals);
1743	tgsi_parse_free(&ctx.parse);
1744	return r;
1745}
1746
1747static int tgsi_unsupported(struct r600_shader_ctx *ctx)
1748{
1749	R600_ERR("%s tgsi opcode unsupported\n",
1750		 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
1751	return -EINVAL;
1752}
1753
/* Handler that emits nothing and always reports success (returns 0). */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	(void)ctx;	/* intentionally unused */

	return 0;
}
1758
1759static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
1760			const struct r600_shader_src *shader_src,
1761			unsigned chan)
1762{
1763	bc_src->sel = shader_src->sel;
1764	bc_src->chan = shader_src->swizzle[chan];
1765	bc_src->neg = shader_src->neg;
1766	bc_src->abs = shader_src->abs;
1767	bc_src->rel = shader_src->rel;
1768	bc_src->value = shader_src->value[bc_src->chan];
1769}
1770
1771static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
1772{
1773	bc_src->abs = 1;
1774	bc_src->neg = 0;
1775}
1776
1777static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
1778{
1779	bc_src->neg = !bc_src->neg;
1780}
1781
1782static void tgsi_dst(struct r600_shader_ctx *ctx,
1783		     const struct tgsi_full_dst_register *tgsi_dst,
1784		     unsigned swizzle,
1785		     struct r600_bytecode_alu_dst *r600_dst)
1786{
1787	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1788
1789	r600_dst->sel = tgsi_dst->Register.Index;
1790	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
1791	r600_dst->chan = swizzle;
1792	r600_dst->write = 1;
1793	if (tgsi_dst->Register.Indirect)
1794		r600_dst->rel = V_SQ_REL_RELATIVE;
1795	if (inst->Instruction.Saturate) {
1796		r600_dst->clamp = 1;
1797	}
1798}
1799
/*
 * Return the index (0..3) of the highest channel enabled in `writemask`.
 * Only the low four bits are looked at; an empty mask yields 0.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* scan from W down to Y; X (or nothing) falls through to 0 */
	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1u << chan))
			return chan;
	}
	return 0;
}
1811
1812static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
1813{
1814	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1815	struct r600_bytecode_alu alu;
1816	int i, j, r;
1817	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1818
1819	for (i = 0; i < lasti + 1; i++) {
1820		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1821			continue;
1822
1823		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1824		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1825
1826		alu.inst = ctx->inst_info->r600_opcode;
1827		if (!swap) {
1828			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1829				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
1830			}
1831		} else {
1832			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
1833			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1834		}
1835		/* handle some special cases */
1836		switch (ctx->inst_info->tgsi_opcode) {
1837		case TGSI_OPCODE_SUB:
1838			r600_bytecode_src_toggle_neg(&alu.src[1]);
1839			break;
1840		case TGSI_OPCODE_ABS:
1841			r600_bytecode_src_set_abs(&alu.src[0]);
1842			break;
1843		default:
1844			break;
1845		}
1846		if (i == lasti || trans_only) {
1847			alu.last = 1;
1848		}
1849		r = r600_bytecode_add_alu(ctx->bc, &alu);
1850		if (r)
1851			return r;
1852	}
1853	return 0;
1854}
1855
/* tgsi_op2_s() with sources in TGSI order and `last` only on the final channel. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1860
/* tgsi_op2_s() with the first two source operands exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	const int swap = 1;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1865
/* tgsi_op2_s() with every emitted ALU instruction marked as `last`. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 1;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1870
1871static int tgsi_ineg(struct r600_shader_ctx *ctx)
1872{
1873	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1874	struct r600_bytecode_alu alu;
1875	int i, r;
1876	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1877
1878	for (i = 0; i < lasti + 1; i++) {
1879
1880		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1881			continue;
1882		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1883		alu.inst = ctx->inst_info->r600_opcode;
1884
1885		alu.src[0].sel = V_SQ_ALU_SRC_0;
1886
1887		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1888
1889		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1890
1891		if (i == lasti) {
1892			alu.last = 1;
1893		}
1894		r = r600_bytecode_add_alu(ctx->bc, &alu);
1895		if (r)
1896			return r;
1897	}
1898	return 0;
1899
1900}
1901
1902static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
1903{
1904	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1905	int i, j, r;
1906	struct r600_bytecode_alu alu;
1907	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1908
1909	for (i = 0 ; i < last_slot; i++) {
1910		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1911		alu.inst = ctx->inst_info->r600_opcode;
1912		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1913			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
1914
1915			/* RSQ should take the absolute value of src */
1916			if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_RSQ) {
1917				r600_bytecode_src_set_abs(&alu.src[j]);
1918			}
1919		}
1920		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1921		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
1922
1923		if (i == last_slot - 1)
1924			alu.last = 1;
1925		r = r600_bytecode_add_alu(ctx->bc, &alu);
1926		if (r)
1927			return r;
1928	}
1929	return 0;
1930}
1931
1932static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
1933{
1934	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1935	int i, j, k, r;
1936	struct r600_bytecode_alu alu;
1937	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1938	for (k = 0; k < last_slot; k++) {
1939		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
1940			continue;
1941
1942		for (i = 0 ; i < 4; i++) {
1943			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1944			alu.inst = ctx->inst_info->r600_opcode;
1945			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1946				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
1947			}
1948			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1949			alu.dst.write = (i == k);
1950			if (i == 3)
1951				alu.last = 1;
1952			r = r600_bytecode_add_alu(ctx->bc, &alu);
1953			if (r)
1954				return r;
1955		}
1956	}
1957	return 0;
1958}
1959
1960/*
1961 * r600 - trunc to -PI..PI range
1962 * r700 - normalize by dividing by 2PI
1963 * see fdo bug 27901
1964 */
1965static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
1966{
1967	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
1968	static float double_pi = 3.1415926535 * 2;
1969	static float neg_pi = -3.1415926535;
1970
1971	int r;
1972	struct r600_bytecode_alu alu;
1973
1974	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1975	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
1976	alu.is_op3 = 1;
1977
1978	alu.dst.chan = 0;
1979	alu.dst.sel = ctx->temp_reg;
1980	alu.dst.write = 1;
1981
1982	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
1983
1984	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1985	alu.src[1].chan = 0;
1986	alu.src[1].value = *(uint32_t *)&half_inv_pi;
1987	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
1988	alu.src[2].chan = 0;
1989	alu.last = 1;
1990	r = r600_bytecode_add_alu(ctx->bc, &alu);
1991	if (r)
1992		return r;
1993
1994	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1995	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
1996
1997	alu.dst.chan = 0;
1998	alu.dst.sel = ctx->temp_reg;
1999	alu.dst.write = 1;
2000
2001	alu.src[0].sel = ctx->temp_reg;
2002	alu.src[0].chan = 0;
2003	alu.last = 1;
2004	r = r600_bytecode_add_alu(ctx->bc, &alu);
2005	if (r)
2006		return r;
2007
2008	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2009	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
2010	alu.is_op3 = 1;
2011
2012	alu.dst.chan = 0;
2013	alu.dst.sel = ctx->temp_reg;
2014	alu.dst.write = 1;
2015
2016	alu.src[0].sel = ctx->temp_reg;
2017	alu.src[0].chan = 0;
2018
2019	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2020	alu.src[1].chan = 0;
2021	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
2022	alu.src[2].chan = 0;
2023
2024	if (ctx->bc->chip_class == R600) {
2025		alu.src[1].value = *(uint32_t *)&double_pi;
2026		alu.src[2].value = *(uint32_t *)&neg_pi;
2027	} else {
2028		alu.src[1].sel = V_SQ_ALU_SRC_1;
2029		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2030		alu.src[2].neg = 1;
2031	}
2032
2033	alu.last = 1;
2034	r = r600_bytecode_add_alu(ctx->bc, &alu);
2035	if (r)
2036		return r;
2037	return 0;
2038}
2039
2040static int cayman_trig(struct r600_shader_ctx *ctx)
2041{
2042	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2043	struct r600_bytecode_alu alu;
2044	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2045	int i, r;
2046
2047	r = tgsi_setup_trig(ctx);
2048	if (r)
2049		return r;
2050
2051
2052	for (i = 0; i < last_slot; i++) {
2053		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2054		alu.inst = ctx->inst_info->r600_opcode;
2055		alu.dst.chan = i;
2056
2057		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2058		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2059
2060		alu.src[0].sel = ctx->temp_reg;
2061		alu.src[0].chan = 0;
2062		if (i == last_slot - 1)
2063			alu.last = 1;
2064		r = r600_bytecode_add_alu(ctx->bc, &alu);
2065		if (r)
2066			return r;
2067	}
2068	return 0;
2069}
2070
2071static int tgsi_trig(struct r600_shader_ctx *ctx)
2072{
2073	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2074	struct r600_bytecode_alu alu;
2075	int i, r;
2076	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2077
2078	r = tgsi_setup_trig(ctx);
2079	if (r)
2080		return r;
2081
2082	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2083	alu.inst = ctx->inst_info->r600_opcode;
2084	alu.dst.chan = 0;
2085	alu.dst.sel = ctx->temp_reg;
2086	alu.dst.write = 1;
2087
2088	alu.src[0].sel = ctx->temp_reg;
2089	alu.src[0].chan = 0;
2090	alu.last = 1;
2091	r = r600_bytecode_add_alu(ctx->bc, &alu);
2092	if (r)
2093		return r;
2094
2095	/* replicate result */
2096	for (i = 0; i < lasti + 1; i++) {
2097		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2098			continue;
2099
2100		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2101		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2102
2103		alu.src[0].sel = ctx->temp_reg;
2104		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2105		if (i == lasti)
2106			alu.last = 1;
2107		r = r600_bytecode_add_alu(ctx->bc, &alu);
2108		if (r)
2109			return r;
2110	}
2111	return 0;
2112}
2113
2114static int tgsi_scs(struct r600_shader_ctx *ctx)
2115{
2116	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2117	struct r600_bytecode_alu alu;
2118	int i, r;
2119
2120	/* We'll only need the trig stuff if we are going to write to the
2121	 * X or Y components of the destination vector.
2122	 */
2123	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
2124		r = tgsi_setup_trig(ctx);
2125		if (r)
2126			return r;
2127	}
2128
2129	/* dst.x = COS */
2130	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2131		if (ctx->bc->chip_class == CAYMAN) {
2132			for (i = 0 ; i < 3; i++) {
2133				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2134				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2135				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2136
2137				if (i == 0)
2138					alu.dst.write = 1;
2139				else
2140					alu.dst.write = 0;
2141				alu.src[0].sel = ctx->temp_reg;
2142				alu.src[0].chan = 0;
2143				if (i == 2)
2144					alu.last = 1;
2145				r = r600_bytecode_add_alu(ctx->bc, &alu);
2146				if (r)
2147					return r;
2148			}
2149		} else {
2150			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2151			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2152			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2153
2154			alu.src[0].sel = ctx->temp_reg;
2155			alu.src[0].chan = 0;
2156			alu.last = 1;
2157			r = r600_bytecode_add_alu(ctx->bc, &alu);
2158			if (r)
2159				return r;
2160		}
2161	}
2162
2163	/* dst.y = SIN */
2164	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2165		if (ctx->bc->chip_class == CAYMAN) {
2166			for (i = 0 ; i < 3; i++) {
2167				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2168				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2169				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2170				if (i == 1)
2171					alu.dst.write = 1;
2172				else
2173					alu.dst.write = 0;
2174				alu.src[0].sel = ctx->temp_reg;
2175				alu.src[0].chan = 0;
2176				if (i == 2)
2177					alu.last = 1;
2178				r = r600_bytecode_add_alu(ctx->bc, &alu);
2179				if (r)
2180					return r;
2181			}
2182		} else {
2183			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2184			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2185			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2186
2187			alu.src[0].sel = ctx->temp_reg;
2188			alu.src[0].chan = 0;
2189			alu.last = 1;
2190			r = r600_bytecode_add_alu(ctx->bc, &alu);
2191			if (r)
2192				return r;
2193		}
2194	}
2195
2196	/* dst.z = 0.0; */
2197	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2198		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2199
2200		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2201
2202		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2203
2204		alu.src[0].sel = V_SQ_ALU_SRC_0;
2205		alu.src[0].chan = 0;
2206
2207		alu.last = 1;
2208
2209		r = r600_bytecode_add_alu(ctx->bc, &alu);
2210		if (r)
2211			return r;
2212	}
2213
2214	/* dst.w = 1.0; */
2215	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2216		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2217
2218		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2219
2220		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2221
2222		alu.src[0].sel = V_SQ_ALU_SRC_1;
2223		alu.src[0].chan = 0;
2224
2225		alu.last = 1;
2226
2227		r = r600_bytecode_add_alu(ctx->bc, &alu);
2228		if (r)
2229			return r;
2230	}
2231
2232	return 0;
2233}
2234
2235static int tgsi_kill(struct r600_shader_ctx *ctx)
2236{
2237	struct r600_bytecode_alu alu;
2238	int i, r;
2239
2240	for (i = 0; i < 4; i++) {
2241		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2242		alu.inst = ctx->inst_info->r600_opcode;
2243
2244		alu.dst.chan = i;
2245
2246		alu.src[0].sel = V_SQ_ALU_SRC_0;
2247
2248		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) {
2249			alu.src[1].sel = V_SQ_ALU_SRC_1;
2250			alu.src[1].neg = 1;
2251		} else {
2252			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2253		}
2254		if (i == 3) {
2255			alu.last = 1;
2256		}
2257		r = r600_bytecode_add_alu(ctx->bc, &alu);
2258		if (r)
2259			return r;
2260	}
2261
2262	/* kill must be last in ALU */
2263	ctx->bc->force_add_cf = 1;
2264	ctx->shader->uses_kill = TRUE;
2265	return 0;
2266}
2267
2268static int tgsi_lit(struct r600_shader_ctx *ctx)
2269{
2270	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2271	struct r600_bytecode_alu alu;
2272	int r;
2273
2274	/* tmp.x = max(src.y, 0.0) */
2275	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2276	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2277	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
2278	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2279	alu.src[1].chan = 1;
2280
2281	alu.dst.sel = ctx->temp_reg;
2282	alu.dst.chan = 0;
2283	alu.dst.write = 1;
2284
2285	alu.last = 1;
2286	r = r600_bytecode_add_alu(ctx->bc, &alu);
2287	if (r)
2288		return r;
2289
2290	if (inst->Dst[0].Register.WriteMask & (1 << 2))
2291	{
2292		int chan;
2293		int sel;
2294		int i;
2295
2296		if (ctx->bc->chip_class == CAYMAN) {
2297			for (i = 0; i < 3; i++) {
2298				/* tmp.z = log(tmp.x) */
2299				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2300				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2301				alu.src[0].sel = ctx->temp_reg;
2302				alu.src[0].chan = 0;
2303				alu.dst.sel = ctx->temp_reg;
2304				alu.dst.chan = i;
2305				if (i == 2) {
2306					alu.dst.write = 1;
2307					alu.last = 1;
2308				} else
2309					alu.dst.write = 0;
2310
2311				r = r600_bytecode_add_alu(ctx->bc, &alu);
2312				if (r)
2313					return r;
2314			}
2315		} else {
2316			/* tmp.z = log(tmp.x) */
2317			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2318			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2319			alu.src[0].sel = ctx->temp_reg;
2320			alu.src[0].chan = 0;
2321			alu.dst.sel = ctx->temp_reg;
2322			alu.dst.chan = 2;
2323			alu.dst.write = 1;
2324			alu.last = 1;
2325			r = r600_bytecode_add_alu(ctx->bc, &alu);
2326			if (r)
2327				return r;
2328		}
2329
2330		chan = alu.dst.chan;
2331		sel = alu.dst.sel;
2332
2333		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
2334		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2335		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT);
2336		alu.src[0].sel  = sel;
2337		alu.src[0].chan = chan;
2338		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
2339		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
2340		alu.dst.sel = ctx->temp_reg;
2341		alu.dst.chan = 0;
2342		alu.dst.write = 1;
2343		alu.is_op3 = 1;
2344		alu.last = 1;
2345		r = r600_bytecode_add_alu(ctx->bc, &alu);
2346		if (r)
2347			return r;
2348
2349		if (ctx->bc->chip_class == CAYMAN) {
2350			for (i = 0; i < 3; i++) {
2351				/* dst.z = exp(tmp.x) */
2352				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2353				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2354				alu.src[0].sel = ctx->temp_reg;
2355				alu.src[0].chan = 0;
2356				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2357				if (i == 2) {
2358					alu.dst.write = 1;
2359					alu.last = 1;
2360				} else
2361					alu.dst.write = 0;
2362				r = r600_bytecode_add_alu(ctx->bc, &alu);
2363				if (r)
2364					return r;
2365			}
2366		} else {
2367			/* dst.z = exp(tmp.x) */
2368			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2369			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2370			alu.src[0].sel = ctx->temp_reg;
2371			alu.src[0].chan = 0;
2372			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2373			alu.last = 1;
2374			r = r600_bytecode_add_alu(ctx->bc, &alu);
2375			if (r)
2376				return r;
2377		}
2378	}
2379
2380	/* dst.x, <- 1.0  */
2381	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2382	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2383	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
2384	alu.src[0].chan = 0;
2385	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2386	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
2387	r = r600_bytecode_add_alu(ctx->bc, &alu);
2388	if (r)
2389		return r;
2390
2391	/* dst.y = max(src.x, 0.0) */
2392	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2393	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2394	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2395	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2396	alu.src[1].chan = 0;
2397	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2398	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
2399	r = r600_bytecode_add_alu(ctx->bc, &alu);
2400	if (r)
2401		return r;
2402
2403	/* dst.w, <- 1.0  */
2404	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2405	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2406	alu.src[0].sel  = V_SQ_ALU_SRC_1;
2407	alu.src[0].chan = 0;
2408	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2409	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
2410	alu.last = 1;
2411	r = r600_bytecode_add_alu(ctx->bc, &alu);
2412	if (r)
2413		return r;
2414
2415	return 0;
2416}
2417
2418static int tgsi_rsq(struct r600_shader_ctx *ctx)
2419{
2420	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2421	struct r600_bytecode_alu alu;
2422	int i, r;
2423
2424	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2425
2426	/* XXX:
2427	 * For state trackers other than OpenGL, we'll want to use
2428	 * _RECIPSQRT_IEEE instead.
2429	 */
2430	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED);
2431
2432	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2433		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2434		r600_bytecode_src_set_abs(&alu.src[i]);
2435	}
2436	alu.dst.sel = ctx->temp_reg;
2437	alu.dst.write = 1;
2438	alu.last = 1;
2439	r = r600_bytecode_add_alu(ctx->bc, &alu);
2440	if (r)
2441		return r;
2442	/* replicate result */
2443	return tgsi_helper_tempx_replicate(ctx);
2444}
2445
2446static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
2447{
2448	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2449	struct r600_bytecode_alu alu;
2450	int i, r;
2451
2452	for (i = 0; i < 4; i++) {
2453		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2454		alu.src[0].sel = ctx->temp_reg;
2455		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2456		alu.dst.chan = i;
2457		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2458		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2459		if (i == 3)
2460			alu.last = 1;
2461		r = r600_bytecode_add_alu(ctx->bc, &alu);
2462		if (r)
2463			return r;
2464	}
2465	return 0;
2466}
2467
2468static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
2469{
2470	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2471	struct r600_bytecode_alu alu;
2472	int i, r;
2473
2474	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2475	alu.inst = ctx->inst_info->r600_opcode;
2476	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2477		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2478	}
2479	alu.dst.sel = ctx->temp_reg;
2480	alu.dst.write = 1;
2481	alu.last = 1;
2482	r = r600_bytecode_add_alu(ctx->bc, &alu);
2483	if (r)
2484		return r;
2485	/* replicate result */
2486	return tgsi_helper_tempx_replicate(ctx);
2487}
2488
2489static int cayman_pow(struct r600_shader_ctx *ctx)
2490{
2491	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2492	int i, r;
2493	struct r600_bytecode_alu alu;
2494	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2495
2496	for (i = 0; i < 3; i++) {
2497		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2498		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2499		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2500		alu.dst.sel = ctx->temp_reg;
2501		alu.dst.chan = i;
2502		alu.dst.write = 1;
2503		if (i == 2)
2504			alu.last = 1;
2505		r = r600_bytecode_add_alu(ctx->bc, &alu);
2506		if (r)
2507			return r;
2508	}
2509
2510	/* b * LOG2(a) */
2511	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2512	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2513	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2514	alu.src[1].sel = ctx->temp_reg;
2515	alu.dst.sel = ctx->temp_reg;
2516	alu.dst.write = 1;
2517	alu.last = 1;
2518	r = r600_bytecode_add_alu(ctx->bc, &alu);
2519	if (r)
2520		return r;
2521
2522	for (i = 0; i < last_slot; i++) {
2523		/* POW(a,b) = EXP2(b * LOG2(a))*/
2524		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2525		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2526		alu.src[0].sel = ctx->temp_reg;
2527
2528		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2529		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2530		if (i == last_slot - 1)
2531			alu.last = 1;
2532		r = r600_bytecode_add_alu(ctx->bc, &alu);
2533		if (r)
2534			return r;
2535	}
2536	return 0;
2537}
2538
2539static int tgsi_pow(struct r600_shader_ctx *ctx)
2540{
2541	struct r600_bytecode_alu alu;
2542	int r;
2543
2544	/* LOG2(a) */
2545	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2546	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2547	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2548	alu.dst.sel = ctx->temp_reg;
2549	alu.dst.write = 1;
2550	alu.last = 1;
2551	r = r600_bytecode_add_alu(ctx->bc, &alu);
2552	if (r)
2553		return r;
2554	/* b * LOG2(a) */
2555	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2556	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2557	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2558	alu.src[1].sel = ctx->temp_reg;
2559	alu.dst.sel = ctx->temp_reg;
2560	alu.dst.write = 1;
2561	alu.last = 1;
2562	r = r600_bytecode_add_alu(ctx->bc, &alu);
2563	if (r)
2564		return r;
2565	/* POW(a,b) = EXP2(b * LOG2(a))*/
2566	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2567	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2568	alu.src[0].sel = ctx->temp_reg;
2569	alu.dst.sel = ctx->temp_reg;
2570	alu.dst.write = 1;
2571	alu.last = 1;
2572	r = r600_bytecode_add_alu(ctx->bc, &alu);
2573	if (r)
2574		return r;
2575	return tgsi_helper_tempx_replicate(ctx);
2576}
2577
2578static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
2579{
2580	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2581	struct r600_bytecode_alu alu;
2582	int i, r, j;
2583	unsigned write_mask = inst->Dst[0].Register.WriteMask;
2584	int tmp0 = ctx->temp_reg;
2585	int tmp1 = r600_get_temp(ctx);
2586	int tmp2 = r600_get_temp(ctx);
2587	int tmp3 = r600_get_temp(ctx);
2588	/* Unsigned path:
2589	 *
2590	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
2591	 *
2592	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
2593	 * 2. tmp0.z = lo (tmp0.x * src2)
2594	 * 3. tmp0.w = -tmp0.z
2595	 * 4. tmp0.y = hi (tmp0.x * src2)
2596	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
2597	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
2598	 * 7. tmp1.x = tmp0.x - tmp0.w
2599	 * 8. tmp1.y = tmp0.x + tmp0.w
2600	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
2601	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
2602	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
2603	 *
2604	 * 12. tmp0.w = src1 - tmp0.y       = r
2605	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
2606	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
2607	 *
2608	 * if DIV
2609	 *
2610	 *   15. tmp1.z = tmp0.z + 1			= q + 1
2611	 *   16. tmp1.w = tmp0.z - 1			= q - 1
2612	 *
2613	 * else MOD
2614	 *
2615	 *   15. tmp1.z = tmp0.w - src2			= r - src2
2616	 *   16. tmp1.w = tmp0.w + src2			= r + src2
2617	 *
2618	 * endif
2619	 *
2620	 * 17. tmp1.x = tmp1.x & tmp1.y
2621	 *
2622	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
2623	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
2624	 *
2625	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
2626	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
2627	 *
2628	 * Signed path:
2629	 *
2630	 * Same as unsigned, using abs values of the operands,
2631	 * and fixing the sign of the result in the end.
2632	 */
2633
2634	for (i = 0; i < 4; i++) {
2635		if (!(write_mask & (1<<i)))
2636			continue;
2637
2638		if (signed_op) {
2639
2640			/* tmp2.x = -src0 */
2641			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2642			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2643
2644			alu.dst.sel = tmp2;
2645			alu.dst.chan = 0;
2646			alu.dst.write = 1;
2647
2648			alu.src[0].sel = V_SQ_ALU_SRC_0;
2649
2650			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2651
2652			alu.last = 1;
2653			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2654				return r;
2655
2656			/* tmp2.y = -src1 */
2657			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2658			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2659
2660			alu.dst.sel = tmp2;
2661			alu.dst.chan = 1;
2662			alu.dst.write = 1;
2663
2664			alu.src[0].sel = V_SQ_ALU_SRC_0;
2665
2666			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2667
2668			alu.last = 1;
2669			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2670				return r;
2671
2672			/* tmp2.z sign bit is set if src0 and src2 signs are different */
2673			/* it will be a sign of the quotient */
2674			if (!mod) {
2675
2676				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2677				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT);
2678
2679				alu.dst.sel = tmp2;
2680				alu.dst.chan = 2;
2681				alu.dst.write = 1;
2682
2683				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2684				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2685
2686				alu.last = 1;
2687				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2688					return r;
2689			}
2690
2691			/* tmp2.x = |src0| */
2692			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2693			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2694			alu.is_op3 = 1;
2695
2696			alu.dst.sel = tmp2;
2697			alu.dst.chan = 0;
2698			alu.dst.write = 1;
2699
2700			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2701			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2702			alu.src[2].sel = tmp2;
2703			alu.src[2].chan = 0;
2704
2705			alu.last = 1;
2706			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2707				return r;
2708
2709			/* tmp2.y = |src1| */
2710			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2711			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2712			alu.is_op3 = 1;
2713
2714			alu.dst.sel = tmp2;
2715			alu.dst.chan = 1;
2716			alu.dst.write = 1;
2717
2718			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2719			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2720			alu.src[2].sel = tmp2;
2721			alu.src[2].chan = 1;
2722
2723			alu.last = 1;
2724			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2725				return r;
2726
2727		}
2728
2729		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
2730		if (ctx->bc->chip_class == CAYMAN) {
2731			/* tmp3.x = u2f(src2) */
2732			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2733			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
2734
2735			alu.dst.sel = tmp3;
2736			alu.dst.chan = 0;
2737			alu.dst.write = 1;
2738
2739			if (signed_op) {
2740				alu.src[0].sel = tmp2;
2741				alu.src[0].chan = 1;
2742			} else {
2743				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2744			}
2745
2746			alu.last = 1;
2747			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2748				return r;
2749
2750			/* tmp0.x = recip(tmp3.x) */
2751			for (j = 0 ; j < 3; j++) {
2752				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2753				alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE;
2754
2755				alu.dst.sel = tmp0;
2756				alu.dst.chan = j;
2757				alu.dst.write = (j == 0);
2758
2759				alu.src[0].sel = tmp3;
2760				alu.src[0].chan = 0;
2761
2762				if (j == 2)
2763					alu.last = 1;
2764				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2765					return r;
2766			}
2767
2768			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2769			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2770
2771			alu.src[0].sel = tmp0;
2772			alu.src[0].chan = 0;
2773
2774			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2775			alu.src[1].value = 0x4f800000;
2776
2777			alu.dst.sel = tmp3;
2778			alu.dst.write = 1;
2779			alu.last = 1;
2780			r = r600_bytecode_add_alu(ctx->bc, &alu);
2781			if (r)
2782				return r;
2783
2784			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2785			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
2786
2787			alu.dst.sel = tmp0;
2788			alu.dst.chan = 0;
2789			alu.dst.write = 1;
2790
2791			alu.src[0].sel = tmp3;
2792			alu.src[0].chan = 0;
2793
2794			alu.last = 1;
2795			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2796				return r;
2797
2798		} else {
2799			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2800			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT);
2801
2802			alu.dst.sel = tmp0;
2803			alu.dst.chan = 0;
2804			alu.dst.write = 1;
2805
2806			if (signed_op) {
2807				alu.src[0].sel = tmp2;
2808				alu.src[0].chan = 1;
2809			} else {
2810				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2811			}
2812
2813			alu.last = 1;
2814			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2815				return r;
2816		}
2817
2818		/* 2. tmp0.z = lo (tmp0.x * src2) */
2819		if (ctx->bc->chip_class == CAYMAN) {
2820			for (j = 0 ; j < 4; j++) {
2821				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2822				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2823
2824				alu.dst.sel = tmp0;
2825				alu.dst.chan = j;
2826				alu.dst.write = (j == 2);
2827
2828				alu.src[0].sel = tmp0;
2829				alu.src[0].chan = 0;
2830				if (signed_op) {
2831					alu.src[1].sel = tmp2;
2832					alu.src[1].chan = 1;
2833				} else {
2834					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2835				}
2836
2837				alu.last = (j == 3);
2838				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2839					return r;
2840			}
2841		} else {
2842			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2843			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2844
2845			alu.dst.sel = tmp0;
2846			alu.dst.chan = 2;
2847			alu.dst.write = 1;
2848
2849			alu.src[0].sel = tmp0;
2850			alu.src[0].chan = 0;
2851			if (signed_op) {
2852				alu.src[1].sel = tmp2;
2853				alu.src[1].chan = 1;
2854			} else {
2855				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2856			}
2857
2858			alu.last = 1;
2859			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2860				return r;
2861		}
2862
2863		/* 3. tmp0.w = -tmp0.z */
2864		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2865		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2866
2867		alu.dst.sel = tmp0;
2868		alu.dst.chan = 3;
2869		alu.dst.write = 1;
2870
2871		alu.src[0].sel = V_SQ_ALU_SRC_0;
2872		alu.src[1].sel = tmp0;
2873		alu.src[1].chan = 2;
2874
2875		alu.last = 1;
2876		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2877			return r;
2878
2879		/* 4. tmp0.y = hi (tmp0.x * src2) */
2880		if (ctx->bc->chip_class == CAYMAN) {
2881			for (j = 0 ; j < 4; j++) {
2882				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2883				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2884
2885				alu.dst.sel = tmp0;
2886				alu.dst.chan = j;
2887				alu.dst.write = (j == 1);
2888
2889				alu.src[0].sel = tmp0;
2890				alu.src[0].chan = 0;
2891
2892				if (signed_op) {
2893					alu.src[1].sel = tmp2;
2894					alu.src[1].chan = 1;
2895				} else {
2896					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2897				}
2898				alu.last = (j == 3);
2899				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2900					return r;
2901			}
2902		} else {
2903			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2904			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2905
2906			alu.dst.sel = tmp0;
2907			alu.dst.chan = 1;
2908			alu.dst.write = 1;
2909
2910			alu.src[0].sel = tmp0;
2911			alu.src[0].chan = 0;
2912
2913			if (signed_op) {
2914				alu.src[1].sel = tmp2;
2915				alu.src[1].chan = 1;
2916			} else {
2917				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2918			}
2919
2920			alu.last = 1;
2921			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2922				return r;
2923		}
2924
2925		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
2926		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2927		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
2928		alu.is_op3 = 1;
2929
2930		alu.dst.sel = tmp0;
2931		alu.dst.chan = 2;
2932		alu.dst.write = 1;
2933
2934		alu.src[0].sel = tmp0;
2935		alu.src[0].chan = 1;
2936		alu.src[1].sel = tmp0;
2937		alu.src[1].chan = 3;
2938		alu.src[2].sel = tmp0;
2939		alu.src[2].chan = 2;
2940
2941		alu.last = 1;
2942		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2943			return r;
2944
2945		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
2946		if (ctx->bc->chip_class == CAYMAN) {
2947			for (j = 0 ; j < 4; j++) {
2948				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2949				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2950
2951				alu.dst.sel = tmp0;
2952				alu.dst.chan = j;
2953				alu.dst.write = (j == 3);
2954
2955				alu.src[0].sel = tmp0;
2956				alu.src[0].chan = 2;
2957
2958				alu.src[1].sel = tmp0;
2959				alu.src[1].chan = 0;
2960
2961				alu.last = (j == 3);
2962				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2963					return r;
2964			}
2965		} else {
2966			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2967			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2968
2969			alu.dst.sel = tmp0;
2970			alu.dst.chan = 3;
2971			alu.dst.write = 1;
2972
2973			alu.src[0].sel = tmp0;
2974			alu.src[0].chan = 2;
2975
2976			alu.src[1].sel = tmp0;
2977			alu.src[1].chan = 0;
2978
2979			alu.last = 1;
2980			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2981				return r;
2982		}
2983
2984		/* 7. tmp1.x = tmp0.x - tmp0.w */
2985		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2986		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2987
2988		alu.dst.sel = tmp1;
2989		alu.dst.chan = 0;
2990		alu.dst.write = 1;
2991
2992		alu.src[0].sel = tmp0;
2993		alu.src[0].chan = 0;
2994		alu.src[1].sel = tmp0;
2995		alu.src[1].chan = 3;
2996
2997		alu.last = 1;
2998		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2999			return r;
3000
3001		/* 8. tmp1.y = tmp0.x + tmp0.w */
3002		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3003		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3004
3005		alu.dst.sel = tmp1;
3006		alu.dst.chan = 1;
3007		alu.dst.write = 1;
3008
3009		alu.src[0].sel = tmp0;
3010		alu.src[0].chan = 0;
3011		alu.src[1].sel = tmp0;
3012		alu.src[1].chan = 3;
3013
3014		alu.last = 1;
3015		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3016			return r;
3017
3018		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
3019		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3020		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3021		alu.is_op3 = 1;
3022
3023		alu.dst.sel = tmp0;
3024		alu.dst.chan = 0;
3025		alu.dst.write = 1;
3026
3027		alu.src[0].sel = tmp0;
3028		alu.src[0].chan = 1;
3029		alu.src[1].sel = tmp1;
3030		alu.src[1].chan = 1;
3031		alu.src[2].sel = tmp1;
3032		alu.src[2].chan = 0;
3033
3034		alu.last = 1;
3035		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3036			return r;
3037
3038		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
3039		if (ctx->bc->chip_class == CAYMAN) {
3040			for (j = 0 ; j < 4; j++) {
3041				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3042				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3043
3044				alu.dst.sel = tmp0;
3045				alu.dst.chan = j;
3046				alu.dst.write = (j == 2);
3047
3048				alu.src[0].sel = tmp0;
3049				alu.src[0].chan = 0;
3050
3051				if (signed_op) {
3052					alu.src[1].sel = tmp2;
3053					alu.src[1].chan = 0;
3054				} else {
3055					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3056				}
3057
3058				alu.last = (j == 3);
3059				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3060					return r;
3061			}
3062		} else {
3063			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3064			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3065
3066			alu.dst.sel = tmp0;
3067			alu.dst.chan = 2;
3068			alu.dst.write = 1;
3069
3070			alu.src[0].sel = tmp0;
3071			alu.src[0].chan = 0;
3072
3073			if (signed_op) {
3074				alu.src[1].sel = tmp2;
3075				alu.src[1].chan = 0;
3076			} else {
3077				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3078			}
3079
3080			alu.last = 1;
3081			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3082				return r;
3083		}
3084
3085		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
3086		if (ctx->bc->chip_class == CAYMAN) {
3087			for (j = 0 ; j < 4; j++) {
3088				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3089				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3090
3091				alu.dst.sel = tmp0;
3092				alu.dst.chan = j;
3093				alu.dst.write = (j == 1);
3094
3095				if (signed_op) {
3096					alu.src[0].sel = tmp2;
3097					alu.src[0].chan = 1;
3098				} else {
3099					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3100				}
3101
3102				alu.src[1].sel = tmp0;
3103				alu.src[1].chan = 2;
3104
3105				alu.last = (j == 3);
3106				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3107					return r;
3108			}
3109		} else {
3110			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3111			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3112
3113			alu.dst.sel = tmp0;
3114			alu.dst.chan = 1;
3115			alu.dst.write = 1;
3116
3117			if (signed_op) {
3118				alu.src[0].sel = tmp2;
3119				alu.src[0].chan = 1;
3120			} else {
3121				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3122			}
3123
3124			alu.src[1].sel = tmp0;
3125			alu.src[1].chan = 2;
3126
3127			alu.last = 1;
3128			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3129				return r;
3130		}
3131
3132		/* 12. tmp0.w = src1 - tmp0.y       = r */
3133		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3134		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3135
3136		alu.dst.sel = tmp0;
3137		alu.dst.chan = 3;
3138		alu.dst.write = 1;
3139
3140		if (signed_op) {
3141			alu.src[0].sel = tmp2;
3142			alu.src[0].chan = 0;
3143		} else {
3144			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3145		}
3146
3147		alu.src[1].sel = tmp0;
3148		alu.src[1].chan = 1;
3149
3150		alu.last = 1;
3151		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3152			return r;
3153
3154		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
3155		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3156		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3157
3158		alu.dst.sel = tmp1;
3159		alu.dst.chan = 0;
3160		alu.dst.write = 1;
3161
3162		alu.src[0].sel = tmp0;
3163		alu.src[0].chan = 3;
3164		if (signed_op) {
3165			alu.src[1].sel = tmp2;
3166			alu.src[1].chan = 1;
3167		} else {
3168			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3169		}
3170
3171		alu.last = 1;
3172		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3173			return r;
3174
3175		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
3176		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3177		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3178
3179		alu.dst.sel = tmp1;
3180		alu.dst.chan = 1;
3181		alu.dst.write = 1;
3182
3183		if (signed_op) {
3184			alu.src[0].sel = tmp2;
3185			alu.src[0].chan = 0;
3186		} else {
3187			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3188		}
3189
3190		alu.src[1].sel = tmp0;
3191		alu.src[1].chan = 1;
3192
3193		alu.last = 1;
3194		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3195			return r;
3196
3197		if (mod) { /* UMOD */
3198
3199			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
3200			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3201			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3202
3203			alu.dst.sel = tmp1;
3204			alu.dst.chan = 2;
3205			alu.dst.write = 1;
3206
3207			alu.src[0].sel = tmp0;
3208			alu.src[0].chan = 3;
3209
3210			if (signed_op) {
3211				alu.src[1].sel = tmp2;
3212				alu.src[1].chan = 1;
3213			} else {
3214				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3215			}
3216
3217			alu.last = 1;
3218			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3219				return r;
3220
3221			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
3222			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3223			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3224
3225			alu.dst.sel = tmp1;
3226			alu.dst.chan = 3;
3227			alu.dst.write = 1;
3228
3229			alu.src[0].sel = tmp0;
3230			alu.src[0].chan = 3;
3231			if (signed_op) {
3232				alu.src[1].sel = tmp2;
3233				alu.src[1].chan = 1;
3234			} else {
3235				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3236			}
3237
3238			alu.last = 1;
3239			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3240				return r;
3241
3242		} else { /* UDIV */
3243
3244			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
3245			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3246			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3247
3248			alu.dst.sel = tmp1;
3249			alu.dst.chan = 2;
3250			alu.dst.write = 1;
3251
3252			alu.src[0].sel = tmp0;
3253			alu.src[0].chan = 2;
3254			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3255
3256			alu.last = 1;
3257			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3258				return r;
3259
3260			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
3261			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3262			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3263
3264			alu.dst.sel = tmp1;
3265			alu.dst.chan = 3;
3266			alu.dst.write = 1;
3267
3268			alu.src[0].sel = tmp0;
3269			alu.src[0].chan = 2;
3270			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
3271
3272			alu.last = 1;
3273			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3274				return r;
3275
3276		}
3277
3278		/* 17. tmp1.x = tmp1.x & tmp1.y */
3279		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3280		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
3281
3282		alu.dst.sel = tmp1;
3283		alu.dst.chan = 0;
3284		alu.dst.write = 1;
3285
3286		alu.src[0].sel = tmp1;
3287		alu.src[0].chan = 0;
3288		alu.src[1].sel = tmp1;
3289		alu.src[1].chan = 1;
3290
3291		alu.last = 1;
3292		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3293			return r;
3294
3295		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
3296		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
3297		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3298		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3299		alu.is_op3 = 1;
3300
3301		alu.dst.sel = tmp0;
3302		alu.dst.chan = 2;
3303		alu.dst.write = 1;
3304
3305		alu.src[0].sel = tmp1;
3306		alu.src[0].chan = 0;
3307		alu.src[1].sel = tmp0;
3308		alu.src[1].chan = mod ? 3 : 2;
3309		alu.src[2].sel = tmp1;
3310		alu.src[2].chan = 2;
3311
3312		alu.last = 1;
3313		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3314			return r;
3315
3316		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
3317		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3318		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3319		alu.is_op3 = 1;
3320
3321		if (signed_op) {
3322			alu.dst.sel = tmp0;
3323			alu.dst.chan = 2;
3324			alu.dst.write = 1;
3325		} else {
3326			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3327		}
3328
3329		alu.src[0].sel = tmp1;
3330		alu.src[0].chan = 1;
3331		alu.src[1].sel = tmp1;
3332		alu.src[1].chan = 3;
3333		alu.src[2].sel = tmp0;
3334		alu.src[2].chan = 2;
3335
3336		alu.last = 1;
3337		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3338			return r;
3339
3340		if (signed_op) {
3341
3342			/* fix the sign of the result */
3343
3344			if (mod) {
3345
3346				/* tmp0.x = -tmp0.z */
3347				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3348				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3349
3350				alu.dst.sel = tmp0;
3351				alu.dst.chan = 0;
3352				alu.dst.write = 1;
3353
3354				alu.src[0].sel = V_SQ_ALU_SRC_0;
3355				alu.src[1].sel = tmp0;
3356				alu.src[1].chan = 2;
3357
3358				alu.last = 1;
3359				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3360					return r;
3361
3362				/* sign of the remainder is the same as the sign of src0 */
3363				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
3364				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3365				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3366				alu.is_op3 = 1;
3367
3368				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3369
3370				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3371				alu.src[1].sel = tmp0;
3372				alu.src[1].chan = 2;
3373				alu.src[2].sel = tmp0;
3374				alu.src[2].chan = 0;
3375
3376				alu.last = 1;
3377				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3378					return r;
3379
3380			} else {
3381
3382				/* tmp0.x = -tmp0.z */
3383				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3384				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3385
3386				alu.dst.sel = tmp0;
3387				alu.dst.chan = 0;
3388				alu.dst.write = 1;
3389
3390				alu.src[0].sel = V_SQ_ALU_SRC_0;
3391				alu.src[1].sel = tmp0;
3392				alu.src[1].chan = 2;
3393
3394				alu.last = 1;
3395				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3396					return r;
3397
3398				/* fix the quotient sign (same as the sign of src0*src1) */
3399				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
3400				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3401				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3402				alu.is_op3 = 1;
3403
3404				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3405
3406				alu.src[0].sel = tmp2;
3407				alu.src[0].chan = 2;
3408				alu.src[1].sel = tmp0;
3409				alu.src[1].chan = 2;
3410				alu.src[2].sel = tmp0;
3411				alu.src[2].chan = 0;
3412
3413				alu.last = 1;
3414				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3415					return r;
3416			}
3417		}
3418	}
3419	return 0;
3420}
3421
/*
 * UDIV: unsigned integer quotient.
 * Forwards to tgsi_divmod() with both flag arguments cleared
 * (quotient rather than remainder, unsigned rather than signed).
 */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3426
/*
 * UMOD: unsigned integer remainder.
 * Forwards to tgsi_divmod() with the remainder flag set and the
 * signed flag cleared.
 */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3431
/*
 * IDIV: signed integer quotient.
 * Forwards to tgsi_divmod() with the remainder flag cleared and the
 * signed flag set.
 */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3436
/*
 * IMOD: signed integer remainder.
 * Forwards to tgsi_divmod() with both the remainder flag and the
 * signed flag set.
 */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3441
3442
3443static int tgsi_f2i(struct r600_shader_ctx *ctx)
3444{
3445	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3446	struct r600_bytecode_alu alu;
3447	int i, r;
3448	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3449	int last_inst = tgsi_last_instruction(write_mask);
3450
3451	for (i = 0; i < 4; i++) {
3452		if (!(write_mask & (1<<i)))
3453			continue;
3454
3455		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3456		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC);
3457
3458		alu.dst.sel = ctx->temp_reg;
3459		alu.dst.chan = i;
3460		alu.dst.write = 1;
3461
3462		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3463		if (i == last_inst)
3464			alu.last = 1;
3465		r = r600_bytecode_add_alu(ctx->bc, &alu);
3466		if (r)
3467			return r;
3468	}
3469
3470	for (i = 0; i < 4; i++) {
3471		if (!(write_mask & (1<<i)))
3472			continue;
3473
3474		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3475		alu.inst = ctx->inst_info->r600_opcode;
3476
3477		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3478
3479		alu.src[0].sel = ctx->temp_reg;
3480		alu.src[0].chan = i;
3481
3482		if (i == last_inst || alu.inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT)
3483			alu.last = 1;
3484		r = r600_bytecode_add_alu(ctx->bc, &alu);
3485		if (r)
3486			return r;
3487	}
3488
3489	return 0;
3490}
3491
3492static int tgsi_iabs(struct r600_shader_ctx *ctx)
3493{
3494	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3495	struct r600_bytecode_alu alu;
3496	int i, r;
3497	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3498	int last_inst = tgsi_last_instruction(write_mask);
3499
3500	/* tmp = -src */
3501	for (i = 0; i < 4; i++) {
3502		if (!(write_mask & (1<<i)))
3503			continue;
3504
3505		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3506		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3507
3508		alu.dst.sel = ctx->temp_reg;
3509		alu.dst.chan = i;
3510		alu.dst.write = 1;
3511
3512		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3513		alu.src[0].sel = V_SQ_ALU_SRC_0;
3514
3515		if (i == last_inst)
3516			alu.last = 1;
3517		r = r600_bytecode_add_alu(ctx->bc, &alu);
3518		if (r)
3519			return r;
3520	}
3521
3522	/* dst = (src >= 0 ? src : tmp) */
3523	for (i = 0; i < 4; i++) {
3524		if (!(write_mask & (1<<i)))
3525			continue;
3526
3527		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3528		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3529		alu.is_op3 = 1;
3530		alu.dst.write = 1;
3531
3532		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3533
3534		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3535		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3536		alu.src[2].sel = ctx->temp_reg;
3537		alu.src[2].chan = i;
3538
3539		if (i == last_inst)
3540			alu.last = 1;
3541		r = r600_bytecode_add_alu(ctx->bc, &alu);
3542		if (r)
3543			return r;
3544	}
3545	return 0;
3546}
3547
3548static int tgsi_issg(struct r600_shader_ctx *ctx)
3549{
3550	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3551	struct r600_bytecode_alu alu;
3552	int i, r;
3553	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3554	int last_inst = tgsi_last_instruction(write_mask);
3555
3556	/* tmp = (src >= 0 ? src : -1) */
3557	for (i = 0; i < 4; i++) {
3558		if (!(write_mask & (1<<i)))
3559			continue;
3560
3561		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3562		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3563		alu.is_op3 = 1;
3564
3565		alu.dst.sel = ctx->temp_reg;
3566		alu.dst.chan = i;
3567		alu.dst.write = 1;
3568
3569		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3570		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3571		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
3572
3573		if (i == last_inst)
3574			alu.last = 1;
3575		r = r600_bytecode_add_alu(ctx->bc, &alu);
3576		if (r)
3577			return r;
3578	}
3579
3580	/* dst = (tmp > 0 ? 1 : tmp) */
3581	for (i = 0; i < 4; i++) {
3582		if (!(write_mask & (1<<i)))
3583			continue;
3584
3585		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3586		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT_INT);
3587		alu.is_op3 = 1;
3588		alu.dst.write = 1;
3589
3590		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3591
3592		alu.src[0].sel = ctx->temp_reg;
3593		alu.src[0].chan = i;
3594
3595		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3596
3597		alu.src[2].sel = ctx->temp_reg;
3598		alu.src[2].chan = i;
3599
3600		if (i == last_inst)
3601			alu.last = 1;
3602		r = r600_bytecode_add_alu(ctx->bc, &alu);
3603		if (r)
3604			return r;
3605	}
3606	return 0;
3607}
3608
3609
3610
3611static int tgsi_ssg(struct r600_shader_ctx *ctx)
3612{
3613	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3614	struct r600_bytecode_alu alu;
3615	int i, r;
3616
3617	/* tmp = (src > 0 ? 1 : src) */
3618	for (i = 0; i < 4; i++) {
3619		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3620		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3621		alu.is_op3 = 1;
3622
3623		alu.dst.sel = ctx->temp_reg;
3624		alu.dst.chan = i;
3625
3626		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3627		alu.src[1].sel = V_SQ_ALU_SRC_1;
3628		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
3629
3630		if (i == 3)
3631			alu.last = 1;
3632		r = r600_bytecode_add_alu(ctx->bc, &alu);
3633		if (r)
3634			return r;
3635	}
3636
3637	/* dst = (-tmp > 0 ? -1 : tmp) */
3638	for (i = 0; i < 4; i++) {
3639		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3640		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3641		alu.is_op3 = 1;
3642		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3643
3644		alu.src[0].sel = ctx->temp_reg;
3645		alu.src[0].chan = i;
3646		alu.src[0].neg = 1;
3647
3648		alu.src[1].sel = V_SQ_ALU_SRC_1;
3649		alu.src[1].neg = 1;
3650
3651		alu.src[2].sel = ctx->temp_reg;
3652		alu.src[2].chan = i;
3653
3654		if (i == 3)
3655			alu.last = 1;
3656		r = r600_bytecode_add_alu(ctx->bc, &alu);
3657		if (r)
3658			return r;
3659	}
3660	return 0;
3661}
3662
3663static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
3664{
3665	struct r600_bytecode_alu alu;
3666	int i, r;
3667
3668	for (i = 0; i < 4; i++) {
3669		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3670		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
3671			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
3672			alu.dst.chan = i;
3673		} else {
3674			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3675			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3676			alu.src[0].sel = ctx->temp_reg;
3677			alu.src[0].chan = i;
3678		}
3679		if (i == 3) {
3680			alu.last = 1;
3681		}
3682		r = r600_bytecode_add_alu(ctx->bc, &alu);
3683		if (r)
3684			return r;
3685	}
3686	return 0;
3687}
3688
3689static int tgsi_op3(struct r600_shader_ctx *ctx)
3690{
3691	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3692	struct r600_bytecode_alu alu;
3693	int i, j, r;
3694	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3695
3696	for (i = 0; i < lasti + 1; i++) {
3697		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3698			continue;
3699
3700		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3701		alu.inst = ctx->inst_info->r600_opcode;
3702		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3703			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3704		}
3705
3706		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3707		alu.dst.chan = i;
3708		alu.dst.write = 1;
3709		alu.is_op3 = 1;
3710		if (i == lasti) {
3711			alu.last = 1;
3712		}
3713		r = r600_bytecode_add_alu(ctx->bc, &alu);
3714		if (r)
3715			return r;
3716	}
3717	return 0;
3718}
3719
3720static int tgsi_dp(struct r600_shader_ctx *ctx)
3721{
3722	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3723	struct r600_bytecode_alu alu;
3724	int i, j, r;
3725
3726	for (i = 0; i < 4; i++) {
3727		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3728		alu.inst = ctx->inst_info->r600_opcode;
3729		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3730			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3731		}
3732
3733		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3734		alu.dst.chan = i;
3735		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3736		/* handle some special cases */
3737		switch (ctx->inst_info->tgsi_opcode) {
3738		case TGSI_OPCODE_DP2:
3739			if (i > 1) {
3740				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3741				alu.src[0].chan = alu.src[1].chan = 0;
3742			}
3743			break;
3744		case TGSI_OPCODE_DP3:
3745			if (i > 2) {
3746				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3747				alu.src[0].chan = alu.src[1].chan = 0;
3748			}
3749			break;
3750		case TGSI_OPCODE_DPH:
3751			if (i == 3) {
3752				alu.src[0].sel = V_SQ_ALU_SRC_1;
3753				alu.src[0].chan = 0;
3754				alu.src[0].neg = 0;
3755			}
3756			break;
3757		default:
3758			break;
3759		}
3760		if (i == 3) {
3761			alu.last = 1;
3762		}
3763		r = r600_bytecode_add_alu(ctx->bc, &alu);
3764		if (r)
3765			return r;
3766	}
3767	return 0;
3768}
3769
3770static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
3771						    unsigned index)
3772{
3773	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3774	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
3775		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
3776		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
3777		ctx->src[index].neg || ctx->src[index].abs;
3778}
3779
3780static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
3781					unsigned index)
3782{
3783	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3784	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
3785}
3786
3787static int tgsi_tex(struct r600_shader_ctx *ctx)
3788{
3789	static float one_point_five = 1.5f;
3790	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3791	struct r600_bytecode_tex tex;
3792	struct r600_bytecode_alu alu;
3793	unsigned src_gpr;
3794	int r, i, j;
3795	int opcode;
3796	/* Texture fetch instructions can only use gprs as source.
3797	 * Also they cannot negate the source or take the absolute value */
3798	const boolean src_requires_loading = inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
3799                                             tgsi_tex_src_requires_loading(ctx, 0);
3800	boolean src_loaded = FALSE;
3801	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
3802	uint8_t offset_x = 0, offset_y = 0, offset_z = 0;
3803
3804	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
3805
3806	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
3807		/* get offset values */
3808		if (inst->Texture.NumOffsets) {
3809			assert(inst->Texture.NumOffsets == 1);
3810
3811			offset_x = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
3812			offset_y = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
3813			offset_z = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
3814		}
3815	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
3816		/* TGSI moves the sampler to src reg 3 for TXD */
3817		sampler_src_reg = 3;
3818
3819		for (i = 1; i < 3; i++) {
3820			/* set gradients h/v */
3821			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
3822			tex.inst = (i == 1) ? SQ_TEX_INST_SET_GRADIENTS_H :
3823				SQ_TEX_INST_SET_GRADIENTS_V;
3824			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
3825			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
3826
3827			if (tgsi_tex_src_requires_loading(ctx, i)) {
3828				tex.src_gpr = r600_get_temp(ctx);
3829				tex.src_sel_x = 0;
3830				tex.src_sel_y = 1;
3831				tex.src_sel_z = 2;
3832				tex.src_sel_w = 3;
3833
3834				for (j = 0; j < 4; j++) {
3835					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3836					alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3837                                        r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
3838                                        alu.dst.sel = tex.src_gpr;
3839                                        alu.dst.chan = j;
3840                                        if (j == 3)
3841                                                alu.last = 1;
3842                                        alu.dst.write = 1;
3843                                        r = r600_bytecode_add_alu(ctx->bc, &alu);
3844                                        if (r)
3845                                                return r;
3846				}
3847
3848			} else {
3849				tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i);
3850				tex.src_sel_x = ctx->src[i].swizzle[0];
3851				tex.src_sel_y = ctx->src[i].swizzle[1];
3852				tex.src_sel_z = ctx->src[i].swizzle[2];
3853				tex.src_sel_w = ctx->src[i].swizzle[3];
3854				tex.src_rel = ctx->src[i].rel;
3855			}
3856			tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */
3857			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
3858			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
3859				tex.coord_type_x = 1;
3860				tex.coord_type_y = 1;
3861				tex.coord_type_z = 1;
3862				tex.coord_type_w = 1;
3863			}
3864			r = r600_bytecode_add_tex(ctx->bc, &tex);
3865			if (r)
3866				return r;
3867		}
3868	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
3869		int out_chan;
3870		/* Add perspective divide */
3871		if (ctx->bc->chip_class == CAYMAN) {
3872			out_chan = 2;
3873			for (i = 0; i < 3; i++) {
3874				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3875				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3876				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3877
3878				alu.dst.sel = ctx->temp_reg;
3879				alu.dst.chan = i;
3880				if (i == 2)
3881					alu.last = 1;
3882				if (out_chan == i)
3883					alu.dst.write = 1;
3884				r = r600_bytecode_add_alu(ctx->bc, &alu);
3885				if (r)
3886					return r;
3887			}
3888
3889		} else {
3890			out_chan = 3;
3891			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3892			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3893			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3894
3895			alu.dst.sel = ctx->temp_reg;
3896			alu.dst.chan = out_chan;
3897			alu.last = 1;
3898			alu.dst.write = 1;
3899			r = r600_bytecode_add_alu(ctx->bc, &alu);
3900			if (r)
3901				return r;
3902		}
3903
3904		for (i = 0; i < 3; i++) {
3905			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3906			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
3907			alu.src[0].sel = ctx->temp_reg;
3908			alu.src[0].chan = out_chan;
3909			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3910			alu.dst.sel = ctx->temp_reg;
3911			alu.dst.chan = i;
3912			alu.dst.write = 1;
3913			r = r600_bytecode_add_alu(ctx->bc, &alu);
3914			if (r)
3915				return r;
3916		}
3917		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3918		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3919		alu.src[0].sel = V_SQ_ALU_SRC_1;
3920		alu.src[0].chan = 0;
3921		alu.dst.sel = ctx->temp_reg;
3922		alu.dst.chan = 3;
3923		alu.last = 1;
3924		alu.dst.write = 1;
3925		r = r600_bytecode_add_alu(ctx->bc, &alu);
3926		if (r)
3927			return r;
3928		src_loaded = TRUE;
3929		src_gpr = ctx->temp_reg;
3930	}
3931
3932	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
3933	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) &&
3934	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
3935	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
3936
3937		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
3938		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
3939
3940		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
3941		for (i = 0; i < 4; i++) {
3942			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3943			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE);
3944			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
3945			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
3946			alu.dst.sel = ctx->temp_reg;
3947			alu.dst.chan = i;
3948			if (i == 3)
3949				alu.last = 1;
3950			alu.dst.write = 1;
3951			r = r600_bytecode_add_alu(ctx->bc, &alu);
3952			if (r)
3953				return r;
3954		}
3955
3956		/* tmp1.z = RCP_e(|tmp1.z|) */
3957		if (ctx->bc->chip_class == CAYMAN) {
3958			for (i = 0; i < 3; i++) {
3959				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3960				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3961				alu.src[0].sel = ctx->temp_reg;
3962				alu.src[0].chan = 2;
3963				alu.src[0].abs = 1;
3964				alu.dst.sel = ctx->temp_reg;
3965				alu.dst.chan = i;
3966				if (i == 2)
3967					alu.dst.write = 1;
3968				if (i == 2)
3969					alu.last = 1;
3970				r = r600_bytecode_add_alu(ctx->bc, &alu);
3971				if (r)
3972					return r;
3973			}
3974		} else {
3975			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3976			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3977			alu.src[0].sel = ctx->temp_reg;
3978			alu.src[0].chan = 2;
3979			alu.src[0].abs = 1;
3980			alu.dst.sel = ctx->temp_reg;
3981			alu.dst.chan = 2;
3982			alu.dst.write = 1;
3983			alu.last = 1;
3984			r = r600_bytecode_add_alu(ctx->bc, &alu);
3985			if (r)
3986				return r;
3987		}
3988
3989		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
3990		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
3991		 * muladd has no writemask, have to use another temp
3992		 */
3993		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3994		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
3995		alu.is_op3 = 1;
3996
3997		alu.src[0].sel = ctx->temp_reg;
3998		alu.src[0].chan = 0;
3999		alu.src[1].sel = ctx->temp_reg;
4000		alu.src[1].chan = 2;
4001
4002		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4003		alu.src[2].chan = 0;
4004		alu.src[2].value = *(uint32_t *)&one_point_five;
4005
4006		alu.dst.sel = ctx->temp_reg;
4007		alu.dst.chan = 0;
4008		alu.dst.write = 1;
4009
4010		r = r600_bytecode_add_alu(ctx->bc, &alu);
4011		if (r)
4012			return r;
4013
4014		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4015		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4016		alu.is_op3 = 1;
4017
4018		alu.src[0].sel = ctx->temp_reg;
4019		alu.src[0].chan = 1;
4020		alu.src[1].sel = ctx->temp_reg;
4021		alu.src[1].chan = 2;
4022
4023		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4024		alu.src[2].chan = 0;
4025		alu.src[2].value = *(uint32_t *)&one_point_five;
4026
4027		alu.dst.sel = ctx->temp_reg;
4028		alu.dst.chan = 1;
4029		alu.dst.write = 1;
4030
4031		alu.last = 1;
4032		r = r600_bytecode_add_alu(ctx->bc, &alu);
4033		if (r)
4034			return r;
4035		/* write initial W value into Z component */
4036		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
4037			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4038			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4039			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4040			alu.dst.sel = ctx->temp_reg;
4041			alu.dst.chan = 2;
4042			alu.dst.write = 1;
4043			alu.last = 1;
4044			r = r600_bytecode_add_alu(ctx->bc, &alu);
4045			if (r)
4046				return r;
4047		}
4048		src_loaded = TRUE;
4049		src_gpr = ctx->temp_reg;
4050	}
4051
4052	if (src_requires_loading && !src_loaded) {
4053		for (i = 0; i < 4; i++) {
4054			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4055			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4056			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4057			alu.dst.sel = ctx->temp_reg;
4058			alu.dst.chan = i;
4059			if (i == 3)
4060				alu.last = 1;
4061			alu.dst.write = 1;
4062			r = r600_bytecode_add_alu(ctx->bc, &alu);
4063			if (r)
4064				return r;
4065		}
4066		src_loaded = TRUE;
4067		src_gpr = ctx->temp_reg;
4068	}
4069
4070	opcode = ctx->inst_info->r600_opcode;
4071	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4072	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4073	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4074	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4075	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
4076	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
4077		switch (opcode) {
4078		case SQ_TEX_INST_SAMPLE:
4079			opcode = SQ_TEX_INST_SAMPLE_C;
4080			break;
4081		case SQ_TEX_INST_SAMPLE_L:
4082			opcode = SQ_TEX_INST_SAMPLE_C_L;
4083			break;
4084		case SQ_TEX_INST_SAMPLE_LB:
4085			opcode = SQ_TEX_INST_SAMPLE_C_LB;
4086			break;
4087		case SQ_TEX_INST_SAMPLE_G:
4088			opcode = SQ_TEX_INST_SAMPLE_C_G;
4089			break;
4090		}
4091	}
4092
4093	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4094	tex.inst = opcode;
4095
4096	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4097	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4098	tex.src_gpr = src_gpr;
4099	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4100	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
4101	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
4102	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
4103	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
4104
4105	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
4106		tex.src_sel_x = 4;
4107		tex.src_sel_y = 4;
4108		tex.src_sel_z = 4;
4109		tex.src_sel_w = 4;
4110	} else if (src_loaded) {
4111		tex.src_sel_x = 0;
4112		tex.src_sel_y = 1;
4113		tex.src_sel_z = 2;
4114		tex.src_sel_w = 3;
4115	} else {
4116		tex.src_sel_x = ctx->src[0].swizzle[0];
4117		tex.src_sel_y = ctx->src[0].swizzle[1];
4118		tex.src_sel_z = ctx->src[0].swizzle[2];
4119		tex.src_sel_w = ctx->src[0].swizzle[3];
4120		tex.src_rel = ctx->src[0].rel;
4121	}
4122
4123	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE) {
4124		tex.src_sel_x = 1;
4125		tex.src_sel_y = 0;
4126		tex.src_sel_z = 3;
4127		tex.src_sel_w = 1;
4128	}
4129	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
4130		tex.src_sel_x = 1;
4131		tex.src_sel_y = 0;
4132		tex.src_sel_z = 3;
4133		tex.src_sel_w = 2; /* route Z compare value into W */
4134	}
4135
4136	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
4137	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
4138		tex.coord_type_x = 1;
4139		tex.coord_type_y = 1;
4140	}
4141	tex.coord_type_z = 1;
4142	tex.coord_type_w = 1;
4143
4144	tex.offset_x = offset_x;
4145	tex.offset_y = offset_y;
4146	tex.offset_z = offset_z;
4147
4148	/* Put the depth for comparison in W.
4149	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
4150	 * Some instructions expect the depth in Z. */
4151	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4152	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4153	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4154	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
4155	    opcode != SQ_TEX_INST_SAMPLE_C_L &&
4156	    opcode != SQ_TEX_INST_SAMPLE_C_LB) {
4157		tex.src_sel_w = tex.src_sel_z;
4158	}
4159
4160	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
4161	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4162		if (opcode == SQ_TEX_INST_SAMPLE_C_L ||
4163		    opcode == SQ_TEX_INST_SAMPLE_C_LB) {
4164			/* the array index is read from Y */
4165			tex.coord_type_y = 0;
4166		} else {
4167			/* the array index is read from Z */
4168			tex.coord_type_z = 0;
4169			tex.src_sel_z = tex.src_sel_y;
4170		}
4171	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
4172		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
4173		/* the array index is read from Z */
4174		tex.coord_type_z = 0;
4175
4176	r = r600_bytecode_add_tex(ctx->bc, &tex);
4177	if (r)
4178		return r;
4179
4180	/* add shadow ambient support  - gallium doesn't do it yet */
4181	return 0;
4182}
4183
4184static int tgsi_lrp(struct r600_shader_ctx *ctx)
4185{
4186	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4187	struct r600_bytecode_alu alu;
4188	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4189	unsigned i;
4190	int r;
4191
4192	/* optimize if it's just an equal balance */
4193	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
4194		for (i = 0; i < lasti + 1; i++) {
4195			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4196				continue;
4197
4198			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4199			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4200			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4201			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4202			alu.omod = 3;
4203			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4204			alu.dst.chan = i;
4205			if (i == lasti) {
4206				alu.last = 1;
4207			}
4208			r = r600_bytecode_add_alu(ctx->bc, &alu);
4209			if (r)
4210				return r;
4211		}
4212		return 0;
4213	}
4214
4215	/* 1 - src0 */
4216	for (i = 0; i < lasti + 1; i++) {
4217		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4218			continue;
4219
4220		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4221		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4222		alu.src[0].sel = V_SQ_ALU_SRC_1;
4223		alu.src[0].chan = 0;
4224		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4225		r600_bytecode_src_toggle_neg(&alu.src[1]);
4226		alu.dst.sel = ctx->temp_reg;
4227		alu.dst.chan = i;
4228		if (i == lasti) {
4229			alu.last = 1;
4230		}
4231		alu.dst.write = 1;
4232		r = r600_bytecode_add_alu(ctx->bc, &alu);
4233		if (r)
4234			return r;
4235	}
4236
4237	/* (1 - src0) * src2 */
4238	for (i = 0; i < lasti + 1; i++) {
4239		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4240			continue;
4241
4242		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4243		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4244		alu.src[0].sel = ctx->temp_reg;
4245		alu.src[0].chan = i;
4246		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4247		alu.dst.sel = ctx->temp_reg;
4248		alu.dst.chan = i;
4249		if (i == lasti) {
4250			alu.last = 1;
4251		}
4252		alu.dst.write = 1;
4253		r = r600_bytecode_add_alu(ctx->bc, &alu);
4254		if (r)
4255			return r;
4256	}
4257
4258	/* src0 * src1 + (1 - src0) * src2 */
4259	for (i = 0; i < lasti + 1; i++) {
4260		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4261			continue;
4262
4263		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4264		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4265		alu.is_op3 = 1;
4266		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4267		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4268		alu.src[2].sel = ctx->temp_reg;
4269		alu.src[2].chan = i;
4270
4271		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4272		alu.dst.chan = i;
4273		if (i == lasti) {
4274			alu.last = 1;
4275		}
4276		r = r600_bytecode_add_alu(ctx->bc, &alu);
4277		if (r)
4278			return r;
4279	}
4280	return 0;
4281}
4282
4283static int tgsi_cmp(struct r600_shader_ctx *ctx)
4284{
4285	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4286	struct r600_bytecode_alu alu;
4287	int i, r;
4288	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4289
4290	for (i = 0; i < lasti + 1; i++) {
4291		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4292			continue;
4293
4294		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4295		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE);
4296		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4297		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4298		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
4299		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4300		alu.dst.chan = i;
4301		alu.dst.write = 1;
4302		alu.is_op3 = 1;
4303		if (i == lasti)
4304			alu.last = 1;
4305		r = r600_bytecode_add_alu(ctx->bc, &alu);
4306		if (r)
4307			return r;
4308	}
4309	return 0;
4310}
4311
4312static int tgsi_xpd(struct r600_shader_ctx *ctx)
4313{
4314	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4315	static const unsigned int src0_swizzle[] = {2, 0, 1};
4316	static const unsigned int src1_swizzle[] = {1, 2, 0};
4317	struct r600_bytecode_alu alu;
4318	uint32_t use_temp = 0;
4319	int i, r;
4320
4321	if (inst->Dst[0].Register.WriteMask != 0xf)
4322		use_temp = 1;
4323
4324	for (i = 0; i < 4; i++) {
4325		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4326		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4327		if (i < 3) {
4328			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
4329			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
4330		} else {
4331			alu.src[0].sel = V_SQ_ALU_SRC_0;
4332			alu.src[0].chan = i;
4333			alu.src[1].sel = V_SQ_ALU_SRC_0;
4334			alu.src[1].chan = i;
4335		}
4336
4337		alu.dst.sel = ctx->temp_reg;
4338		alu.dst.chan = i;
4339		alu.dst.write = 1;
4340
4341		if (i == 3)
4342			alu.last = 1;
4343		r = r600_bytecode_add_alu(ctx->bc, &alu);
4344		if (r)
4345			return r;
4346	}
4347
4348	for (i = 0; i < 4; i++) {
4349		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4350		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4351
4352		if (i < 3) {
4353			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
4354			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
4355		} else {
4356			alu.src[0].sel = V_SQ_ALU_SRC_0;
4357			alu.src[0].chan = i;
4358			alu.src[1].sel = V_SQ_ALU_SRC_0;
4359			alu.src[1].chan = i;
4360		}
4361
4362		alu.src[2].sel = ctx->temp_reg;
4363		alu.src[2].neg = 1;
4364		alu.src[2].chan = i;
4365
4366		if (use_temp)
4367			alu.dst.sel = ctx->temp_reg;
4368		else
4369			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4370		alu.dst.chan = i;
4371		alu.dst.write = 1;
4372		alu.is_op3 = 1;
4373		if (i == 3)
4374			alu.last = 1;
4375		r = r600_bytecode_add_alu(ctx->bc, &alu);
4376		if (r)
4377			return r;
4378	}
4379	if (use_temp)
4380		return tgsi_helper_copy(ctx, inst);
4381	return 0;
4382}
4383
4384static int tgsi_exp(struct r600_shader_ctx *ctx)
4385{
4386	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4387	struct r600_bytecode_alu alu;
4388	int r;
4389	int i;
4390
4391	/* result.x = 2^floor(src); */
4392	if (inst->Dst[0].Register.WriteMask & 1) {
4393		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4394
4395		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4396		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4397
4398		alu.dst.sel = ctx->temp_reg;
4399		alu.dst.chan = 0;
4400		alu.dst.write = 1;
4401		alu.last = 1;
4402		r = r600_bytecode_add_alu(ctx->bc, &alu);
4403		if (r)
4404			return r;
4405
4406		if (ctx->bc->chip_class == CAYMAN) {
4407			for (i = 0; i < 3; i++) {
4408				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4409				alu.src[0].sel = ctx->temp_reg;
4410				alu.src[0].chan = 0;
4411
4412				alu.dst.sel = ctx->temp_reg;
4413				alu.dst.chan = i;
4414				alu.dst.write = i == 0;
4415				alu.last = i == 2;
4416				r = r600_bytecode_add_alu(ctx->bc, &alu);
4417				if (r)
4418					return r;
4419			}
4420		} else {
4421			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4422			alu.src[0].sel = ctx->temp_reg;
4423			alu.src[0].chan = 0;
4424
4425			alu.dst.sel = ctx->temp_reg;
4426			alu.dst.chan = 0;
4427			alu.dst.write = 1;
4428			alu.last = 1;
4429			r = r600_bytecode_add_alu(ctx->bc, &alu);
4430			if (r)
4431				return r;
4432		}
4433	}
4434
4435	/* result.y = tmp - floor(tmp); */
4436	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4437		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4438
4439		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
4440		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4441
4442		alu.dst.sel = ctx->temp_reg;
4443#if 0
4444		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4445		if (r)
4446			return r;
4447#endif
4448		alu.dst.write = 1;
4449		alu.dst.chan = 1;
4450
4451		alu.last = 1;
4452
4453		r = r600_bytecode_add_alu(ctx->bc, &alu);
4454		if (r)
4455			return r;
4456	}
4457
4458	/* result.z = RoughApprox2ToX(tmp);*/
4459	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
4460		if (ctx->bc->chip_class == CAYMAN) {
4461			for (i = 0; i < 3; i++) {
4462				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4463				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4464				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4465
4466				alu.dst.sel = ctx->temp_reg;
4467				alu.dst.chan = i;
4468				if (i == 2) {
4469					alu.dst.write = 1;
4470					alu.last = 1;
4471				}
4472
4473				r = r600_bytecode_add_alu(ctx->bc, &alu);
4474				if (r)
4475					return r;
4476			}
4477		} else {
4478			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4479			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4480			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4481
4482			alu.dst.sel = ctx->temp_reg;
4483			alu.dst.write = 1;
4484			alu.dst.chan = 2;
4485
4486			alu.last = 1;
4487
4488			r = r600_bytecode_add_alu(ctx->bc, &alu);
4489			if (r)
4490				return r;
4491		}
4492	}
4493
4494	/* result.w = 1.0;*/
4495	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
4496		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4497
4498		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4499		alu.src[0].sel = V_SQ_ALU_SRC_1;
4500		alu.src[0].chan = 0;
4501
4502		alu.dst.sel = ctx->temp_reg;
4503		alu.dst.chan = 3;
4504		alu.dst.write = 1;
4505		alu.last = 1;
4506		r = r600_bytecode_add_alu(ctx->bc, &alu);
4507		if (r)
4508			return r;
4509	}
4510	return tgsi_helper_copy(ctx, inst);
4511}
4512
4513static int tgsi_log(struct r600_shader_ctx *ctx)
4514{
4515	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4516	struct r600_bytecode_alu alu;
4517	int r;
4518	int i;
4519
4520	/* result.x = floor(log2(|src|)); */
4521	if (inst->Dst[0].Register.WriteMask & 1) {
4522		if (ctx->bc->chip_class == CAYMAN) {
4523			for (i = 0; i < 3; i++) {
4524				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4525
4526				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4527				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4528				r600_bytecode_src_set_abs(&alu.src[0]);
4529
4530				alu.dst.sel = ctx->temp_reg;
4531				alu.dst.chan = i;
4532				if (i == 0)
4533					alu.dst.write = 1;
4534				if (i == 2)
4535					alu.last = 1;
4536				r = r600_bytecode_add_alu(ctx->bc, &alu);
4537				if (r)
4538					return r;
4539			}
4540
4541		} else {
4542			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4543
4544			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4545			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4546			r600_bytecode_src_set_abs(&alu.src[0]);
4547
4548			alu.dst.sel = ctx->temp_reg;
4549			alu.dst.chan = 0;
4550			alu.dst.write = 1;
4551			alu.last = 1;
4552			r = r600_bytecode_add_alu(ctx->bc, &alu);
4553			if (r)
4554				return r;
4555		}
4556
4557		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4558		alu.src[0].sel = ctx->temp_reg;
4559		alu.src[0].chan = 0;
4560
4561		alu.dst.sel = ctx->temp_reg;
4562		alu.dst.chan = 0;
4563		alu.dst.write = 1;
4564		alu.last = 1;
4565
4566		r = r600_bytecode_add_alu(ctx->bc, &alu);
4567		if (r)
4568			return r;
4569	}
4570
4571	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
4572	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4573
4574		if (ctx->bc->chip_class == CAYMAN) {
4575			for (i = 0; i < 3; i++) {
4576				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4577
4578				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4579				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4580				r600_bytecode_src_set_abs(&alu.src[0]);
4581
4582				alu.dst.sel = ctx->temp_reg;
4583				alu.dst.chan = i;
4584				if (i == 1)
4585					alu.dst.write = 1;
4586				if (i == 2)
4587					alu.last = 1;
4588
4589				r = r600_bytecode_add_alu(ctx->bc, &alu);
4590				if (r)
4591					return r;
4592			}
4593		} else {
4594			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4595
4596			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4597			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4598			r600_bytecode_src_set_abs(&alu.src[0]);
4599
4600			alu.dst.sel = ctx->temp_reg;
4601			alu.dst.chan = 1;
4602			alu.dst.write = 1;
4603			alu.last = 1;
4604
4605			r = r600_bytecode_add_alu(ctx->bc, &alu);
4606			if (r)
4607				return r;
4608		}
4609
4610		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4611
4612		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4613		alu.src[0].sel = ctx->temp_reg;
4614		alu.src[0].chan = 1;
4615
4616		alu.dst.sel = ctx->temp_reg;
4617		alu.dst.chan = 1;
4618		alu.dst.write = 1;
4619		alu.last = 1;
4620
4621		r = r600_bytecode_add_alu(ctx->bc, &alu);
4622		if (r)
4623			return r;
4624
4625		if (ctx->bc->chip_class == CAYMAN) {
4626			for (i = 0; i < 3; i++) {
4627				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4628				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4629				alu.src[0].sel = ctx->temp_reg;
4630				alu.src[0].chan = 1;
4631
4632				alu.dst.sel = ctx->temp_reg;
4633				alu.dst.chan = i;
4634				if (i == 1)
4635					alu.dst.write = 1;
4636				if (i == 2)
4637					alu.last = 1;
4638
4639				r = r600_bytecode_add_alu(ctx->bc, &alu);
4640				if (r)
4641					return r;
4642			}
4643		} else {
4644			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4645			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4646			alu.src[0].sel = ctx->temp_reg;
4647			alu.src[0].chan = 1;
4648
4649			alu.dst.sel = ctx->temp_reg;
4650			alu.dst.chan = 1;
4651			alu.dst.write = 1;
4652			alu.last = 1;
4653
4654			r = r600_bytecode_add_alu(ctx->bc, &alu);
4655			if (r)
4656				return r;
4657		}
4658
4659		if (ctx->bc->chip_class == CAYMAN) {
4660			for (i = 0; i < 3; i++) {
4661				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4662				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4663				alu.src[0].sel = ctx->temp_reg;
4664				alu.src[0].chan = 1;
4665
4666				alu.dst.sel = ctx->temp_reg;
4667				alu.dst.chan = i;
4668				if (i == 1)
4669					alu.dst.write = 1;
4670				if (i == 2)
4671					alu.last = 1;
4672
4673				r = r600_bytecode_add_alu(ctx->bc, &alu);
4674				if (r)
4675					return r;
4676			}
4677		} else {
4678			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4679			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4680			alu.src[0].sel = ctx->temp_reg;
4681			alu.src[0].chan = 1;
4682
4683			alu.dst.sel = ctx->temp_reg;
4684			alu.dst.chan = 1;
4685			alu.dst.write = 1;
4686			alu.last = 1;
4687
4688			r = r600_bytecode_add_alu(ctx->bc, &alu);
4689			if (r)
4690				return r;
4691		}
4692
4693		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4694
4695		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4696
4697		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4698		r600_bytecode_src_set_abs(&alu.src[0]);
4699
4700		alu.src[1].sel = ctx->temp_reg;
4701		alu.src[1].chan = 1;
4702
4703		alu.dst.sel = ctx->temp_reg;
4704		alu.dst.chan = 1;
4705		alu.dst.write = 1;
4706		alu.last = 1;
4707
4708		r = r600_bytecode_add_alu(ctx->bc, &alu);
4709		if (r)
4710			return r;
4711	}
4712
4713	/* result.z = log2(|src|);*/
4714	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
4715		if (ctx->bc->chip_class == CAYMAN) {
4716			for (i = 0; i < 3; i++) {
4717				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4718
4719				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4720				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4721				r600_bytecode_src_set_abs(&alu.src[0]);
4722
4723				alu.dst.sel = ctx->temp_reg;
4724				if (i == 2)
4725					alu.dst.write = 1;
4726				alu.dst.chan = i;
4727				if (i == 2)
4728					alu.last = 1;
4729
4730				r = r600_bytecode_add_alu(ctx->bc, &alu);
4731				if (r)
4732					return r;
4733			}
4734		} else {
4735			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4736
4737			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4738			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4739			r600_bytecode_src_set_abs(&alu.src[0]);
4740
4741			alu.dst.sel = ctx->temp_reg;
4742			alu.dst.write = 1;
4743			alu.dst.chan = 2;
4744			alu.last = 1;
4745
4746			r = r600_bytecode_add_alu(ctx->bc, &alu);
4747			if (r)
4748				return r;
4749		}
4750	}
4751
4752	/* result.w = 1.0; */
4753	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
4754		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4755
4756		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4757		alu.src[0].sel = V_SQ_ALU_SRC_1;
4758		alu.src[0].chan = 0;
4759
4760		alu.dst.sel = ctx->temp_reg;
4761		alu.dst.chan = 3;
4762		alu.dst.write = 1;
4763		alu.last = 1;
4764
4765		r = r600_bytecode_add_alu(ctx->bc, &alu);
4766		if (r)
4767			return r;
4768	}
4769
4770	return tgsi_helper_copy(ctx, inst);
4771}
4772
4773static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
4774{
4775	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4776	struct r600_bytecode_alu alu;
4777	int r;
4778
4779	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4780
4781	switch (inst->Instruction.Opcode) {
4782	case TGSI_OPCODE_ARL:
4783		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR;
4784		break;
4785	case TGSI_OPCODE_ARR:
4786		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4787		break;
4788	case TGSI_OPCODE_UARL:
4789		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4790		break;
4791	default:
4792		assert(0);
4793		return -1;
4794	}
4795
4796	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4797	alu.last = 1;
4798	alu.dst.sel = ctx->bc->ar_reg;
4799	alu.dst.write = 1;
4800	r = r600_bytecode_add_alu(ctx->bc, &alu);
4801	if (r)
4802		return r;
4803
4804	ctx->bc->ar_loaded = 0;
4805	return 0;
4806}
4807static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
4808{
4809	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4810	struct r600_bytecode_alu alu;
4811	int r;
4812
4813	switch (inst->Instruction.Opcode) {
4814	case TGSI_OPCODE_ARL:
4815		memset(&alu, 0, sizeof(alu));
4816		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR;
4817		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4818		alu.dst.sel = ctx->bc->ar_reg;
4819		alu.dst.write = 1;
4820		alu.last = 1;
4821
4822		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4823			return r;
4824
4825		memset(&alu, 0, sizeof(alu));
4826		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4827		alu.src[0].sel = ctx->bc->ar_reg;
4828		alu.dst.sel = ctx->bc->ar_reg;
4829		alu.dst.write = 1;
4830		alu.last = 1;
4831
4832		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4833			return r;
4834		break;
4835	case TGSI_OPCODE_ARR:
4836		memset(&alu, 0, sizeof(alu));
4837		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4838		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4839		alu.dst.sel = ctx->bc->ar_reg;
4840		alu.dst.write = 1;
4841		alu.last = 1;
4842
4843		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4844			return r;
4845		break;
4846	case TGSI_OPCODE_UARL:
4847		memset(&alu, 0, sizeof(alu));
4848		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4849		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4850		alu.dst.sel = ctx->bc->ar_reg;
4851		alu.dst.write = 1;
4852		alu.last = 1;
4853
4854		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4855			return r;
4856		break;
4857	default:
4858		assert(0);
4859		return -1;
4860	}
4861
4862	ctx->bc->ar_loaded = 0;
4863	return 0;
4864}
4865
4866static int tgsi_opdst(struct r600_shader_ctx *ctx)
4867{
4868	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4869	struct r600_bytecode_alu alu;
4870	int i, r = 0;
4871
4872	for (i = 0; i < 4; i++) {
4873		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4874
4875		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4876		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4877
4878		if (i == 0 || i == 3) {
4879			alu.src[0].sel = V_SQ_ALU_SRC_1;
4880		} else {
4881			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4882		}
4883
4884		if (i == 0 || i == 2) {
4885			alu.src[1].sel = V_SQ_ALU_SRC_1;
4886		} else {
4887			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4888		}
4889		if (i == 3)
4890			alu.last = 1;
4891		r = r600_bytecode_add_alu(ctx->bc, &alu);
4892		if (r)
4893			return r;
4894	}
4895	return 0;
4896}
4897
4898static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
4899{
4900	struct r600_bytecode_alu alu;
4901	int r;
4902
4903	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4904	alu.inst = opcode;
4905	alu.execute_mask = 1;
4906	alu.update_pred = 1;
4907
4908	alu.dst.sel = ctx->temp_reg;
4909	alu.dst.write = 1;
4910	alu.dst.chan = 0;
4911
4912	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4913	alu.src[1].sel = V_SQ_ALU_SRC_0;
4914	alu.src[1].chan = 0;
4915
4916	alu.last = 1;
4917
4918	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
4919	if (r)
4920		return r;
4921	return 0;
4922}
4923
4924static int pops(struct r600_shader_ctx *ctx, int pops)
4925{
4926	unsigned force_pop = ctx->bc->force_add_cf;
4927
4928	if (!force_pop) {
4929		int alu_pop = 3;
4930		if (ctx->bc->cf_last) {
4931			if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU))
4932				alu_pop = 0;
4933			else if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER))
4934				alu_pop = 1;
4935		}
4936		alu_pop += pops;
4937		if (alu_pop == 1) {
4938			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER);
4939			ctx->bc->force_add_cf = 1;
4940		} else if (alu_pop == 2) {
4941			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER);
4942			ctx->bc->force_add_cf = 1;
4943		} else {
4944			force_pop = 1;
4945		}
4946	}
4947
4948	if (force_pop) {
4949		r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_POP));
4950		ctx->bc->cf_last->pop_count = pops;
4951		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
4952	}
4953
4954	return 0;
4955}
4956
4957static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
4958{
4959	switch(reason) {
4960	case FC_PUSH_VPM:
4961		ctx->bc->callstack[ctx->bc->call_sp].current--;
4962		break;
4963	case FC_PUSH_WQM:
4964	case FC_LOOP:
4965		ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
4966		break;
4967	case FC_REP:
4968		/* TOODO : for 16 vp asic should -= 2; */
4969		ctx->bc->callstack[ctx->bc->call_sp].current --;
4970		break;
4971	}
4972}
4973
4974static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
4975{
4976	if (check_max_only) {
4977		int diff;
4978		switch (reason) {
4979		case FC_PUSH_VPM:
4980			diff = 1;
4981			break;
4982		case FC_PUSH_WQM:
4983			diff = 4;
4984			break;
4985		default:
4986			assert(0);
4987			diff = 0;
4988		}
4989		if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
4990		    ctx->bc->callstack[ctx->bc->call_sp].max) {
4991			ctx->bc->callstack[ctx->bc->call_sp].max =
4992				ctx->bc->callstack[ctx->bc->call_sp].current + diff;
4993		}
4994		return;
4995	}
4996	switch (reason) {
4997	case FC_PUSH_VPM:
4998		ctx->bc->callstack[ctx->bc->call_sp].current++;
4999		break;
5000	case FC_PUSH_WQM:
5001	case FC_LOOP:
5002		ctx->bc->callstack[ctx->bc->call_sp].current += 4;
5003		break;
5004	case FC_REP:
5005		ctx->bc->callstack[ctx->bc->call_sp].current++;
5006		break;
5007	}
5008
5009	if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
5010	    ctx->bc->callstack[ctx->bc->call_sp].max) {
5011		ctx->bc->callstack[ctx->bc->call_sp].max =
5012			ctx->bc->callstack[ctx->bc->call_sp].current;
5013	}
5014}
5015
5016static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
5017{
5018	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
5019
5020	sp->mid = realloc((void *)sp->mid,
5021						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
5022	sp->mid[sp->num_mid] = ctx->bc->cf_last;
5023	sp->num_mid++;
5024}
5025
5026static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
5027{
5028	ctx->bc->fc_sp++;
5029	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
5030	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
5031}
5032
5033static void fc_poplevel(struct r600_shader_ctx *ctx)
5034{
5035	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
5036	free(sp->mid);
5037	sp->mid = NULL;
5038	sp->num_mid = 0;
5039	sp->start = NULL;
5040	sp->type = 0;
5041	ctx->bc->fc_sp--;
5042}
5043
/*
 * Everything between this #if 0 and the matching #endif is compiled out.
 * It holds unfinished helpers for RETURN / jump / flag handling; several of
 * them are empty stubs.
 */
#if 0
/* Emit a RETURN CF instruction; the add result is not checked. Always returns 0. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
	return 0;
}

/*
 * Emit a JUMP CF instruction with the given pop count.  The `offset`
 * argument is not used yet (see the XXX below).  Always returns 0.
 */
static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

/* Stub: emits nothing and returns 0. */
static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

/* Stub: emits nothing. */
static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

/*
 * Sequence the helpers above: test the flag, jump, reset the flag to
 * V_SQ_ALU_SRC_0, pop ifidx + 1 levels and emit RETURN.
 */
static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

/*
 * Test the flag, emit the current instruction's CF opcode with pop_count 1,
 * record it in the mid list of level fc_sp, then pop one level.
 */
static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
5091
5092static int tgsi_if(struct r600_shader_ctx *ctx)
5093{
5094	emit_logic_pred(ctx, CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
5095
5096	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
5097
5098	fc_pushlevel(ctx, FC_IF);
5099
5100	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
5101	return 0;
5102}
5103
5104static int tgsi_else(struct r600_shader_ctx *ctx)
5105{
5106	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_ELSE));
5107	ctx->bc->cf_last->pop_count = 1;
5108
5109	fc_set_mid(ctx, ctx->bc->fc_sp);
5110	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
5111	return 0;
5112}
5113
5114static int tgsi_endif(struct r600_shader_ctx *ctx)
5115{
5116	pops(ctx, 1);
5117	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
5118		R600_ERR("if/endif unbalanced in shader\n");
5119		return -1;
5120	}
5121
5122	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
5123		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5124		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
5125	} else {
5126		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
5127	}
5128	fc_poplevel(ctx);
5129
5130	callstack_decrease_current(ctx, FC_PUSH_VPM);
5131	return 0;
5132}
5133
5134static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
5135{
5136	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
5137	 * limited to 4096 iterations, like the other LOOP_* instructions. */
5138	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10));
5139
5140	fc_pushlevel(ctx, FC_LOOP);
5141
5142	/* check stack depth */
5143	callstack_check_depth(ctx, FC_LOOP, 0);
5144	return 0;
5145}
5146
5147static int tgsi_endloop(struct r600_shader_ctx *ctx)
5148{
5149	int i;
5150
5151	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END));
5152
5153	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
5154		R600_ERR("loop/endloop in shader code are not paired.\n");
5155		return -EINVAL;
5156	}
5157
5158	/* fixup loop pointers - from r600isa
5159	   LOOP END points to CF after LOOP START,
5160	   LOOP START point to CF after LOOP END
5161	   BRK/CONT point to LOOP END CF
5162	*/
5163	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
5164
5165	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5166
5167	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
5168		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
5169	}
5170	/* XXX add LOOPRET support */
5171	fc_poplevel(ctx);
5172	callstack_decrease_current(ctx, FC_LOOP);
5173	return 0;
5174}
5175
5176static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
5177{
5178	unsigned int fscp;
5179
5180	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
5181	{
5182		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
5183			break;
5184	}
5185
5186	if (fscp == 0) {
5187		R600_ERR("Break not inside loop/endloop pair\n");
5188		return -EINVAL;
5189	}
5190
5191	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
5192
5193	fc_set_mid(ctx, fscp);
5194
5195	callstack_check_depth(ctx, FC_PUSH_VPM, 1);
5196	return 0;
5197}
5198
5199static int tgsi_umad(struct r600_shader_ctx *ctx)
5200{
5201	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5202	struct r600_bytecode_alu alu;
5203	int i, j, r;
5204	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5205
5206	/* src0 * src1 */
5207	for (i = 0; i < lasti + 1; i++) {
5208		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5209			continue;
5210
5211		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5212
5213		alu.dst.chan = i;
5214		alu.dst.sel = ctx->temp_reg;
5215		alu.dst.write = 1;
5216
5217		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
5218		for (j = 0; j < 2; j++) {
5219		        r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5220		}
5221
5222		alu.last = 1;
5223		r = r600_bytecode_add_alu(ctx->bc, &alu);
5224		if (r)
5225			return r;
5226	}
5227
5228
5229	for (i = 0; i < lasti + 1; i++) {
5230		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5231			continue;
5232
5233		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5234		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5235
5236		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
5237
5238		alu.src[0].sel = ctx->temp_reg;
5239		alu.src[0].chan = i;
5240
5241		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5242		if (i == lasti) {
5243			alu.last = 1;
5244		}
5245		r = r600_bytecode_add_alu(ctx->bc, &alu);
5246		if (r)
5247			return r;
5248	}
5249	return 0;
5250}
5251
/*
 * TGSI instruction table built from the un-prefixed V_SQ_* hardware
 * opcodes (presumably the R600/R700 variant; the eg_ table further down
 * uses the EG_V_SQ_* opcodes instead -- confirm at the selection site).
 *
 * Each row lists, in order: the TGSI opcode, a flag that is 1 only for MAD
 * (the one row whose hardware opcode is an OP3 encoding), the hardware
 * opcode, and the handler that translates the instruction. Handlers such
 * as tgsi_loop_brk_cont read the hardware opcode back through
 * ctx->inst_info->r600_opcode (assumes inst_info points at a row of this
 * table -- TODO confirm).
 *
 * Rows are kept in ascending opcode order and unassigned opcode values get
 * numeric "gap" placeholders, presumably so the table can be indexed
 * directly by TGSI opcode; do not reorder or drop rows without checking
 * the lookup code. Rows handled by tgsi_unsupported are, judging by the
 * handler name, opcodes this backend does not translate.
 *
 * NOTE(review): USNE is routed through tgsi_op2_swap here while the eg_
 * table uses tgsi_op2 for the same opcode. Presumably harmless because
 * "not equal" is symmetric in its operands -- confirm.
 */
5252static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
5253	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5254	{TGSI_OPCODE_MOV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5255	{TGSI_OPCODE_LIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5256
5257	/* XXX:
5258	 * For state trackers other than OpenGL, we'll want to use
5259	 * _RECIP_IEEE instead.
5260	 */
5261	{TGSI_OPCODE_RCP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
5262
5263	{TGSI_OPCODE_RSQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_rsq},
5264	{TGSI_OPCODE_EXP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5265	{TGSI_OPCODE_LOG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5266	{TGSI_OPCODE_MUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5267	{TGSI_OPCODE_ADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5268	{TGSI_OPCODE_DP3,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5269	{TGSI_OPCODE_DP4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5270	{TGSI_OPCODE_DST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5271	{TGSI_OPCODE_MIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5272	{TGSI_OPCODE_MAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5273	{TGSI_OPCODE_SLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5274	{TGSI_OPCODE_SGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5275	{TGSI_OPCODE_MAD,	1, V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5276	{TGSI_OPCODE_SUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5277	{TGSI_OPCODE_LRP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5278	{TGSI_OPCODE_CND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5279	/* gap */
5280	{20,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5281	{TGSI_OPCODE_DP2A,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5282	/* gap */
5283	{22,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5284	{23,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5285	{TGSI_OPCODE_FRC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5286	{TGSI_OPCODE_CLAMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5287	{TGSI_OPCODE_FLR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5288	{TGSI_OPCODE_ROUND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5289	{TGSI_OPCODE_EX2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5290	{TGSI_OPCODE_LG2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5291	{TGSI_OPCODE_POW,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5292	{TGSI_OPCODE_XPD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5293	/* gap */
5294	{32,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5295	{TGSI_OPCODE_ABS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5296	{TGSI_OPCODE_RCC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5297	{TGSI_OPCODE_DPH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5298	{TGSI_OPCODE_COS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5299	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5300	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5301	{TGSI_OPCODE_KILP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5302	{TGSI_OPCODE_PK2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5303	{TGSI_OPCODE_PK2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5304	{TGSI_OPCODE_PK4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5305	{TGSI_OPCODE_PK4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5306	{TGSI_OPCODE_RFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5307	{TGSI_OPCODE_SEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5308	{TGSI_OPCODE_SFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5309	{TGSI_OPCODE_SGT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5310	{TGSI_OPCODE_SIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5311	{TGSI_OPCODE_SLE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5312	{TGSI_OPCODE_SNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5313	{TGSI_OPCODE_STR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5314	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5315	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5316	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5317	{TGSI_OPCODE_UP2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5318	{TGSI_OPCODE_UP2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5319	{TGSI_OPCODE_UP4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5320	{TGSI_OPCODE_UP4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5321	{TGSI_OPCODE_X2D,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5322	{TGSI_OPCODE_ARA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5323	{TGSI_OPCODE_ARR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5324	{TGSI_OPCODE_BRA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5325	{TGSI_OPCODE_CAL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5326	{TGSI_OPCODE_RET,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5327	{TGSI_OPCODE_SSG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5328	{TGSI_OPCODE_CMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5329	{TGSI_OPCODE_SCS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5330	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5331	{TGSI_OPCODE_NRM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5332	{TGSI_OPCODE_DIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5333	{TGSI_OPCODE_DP2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5334	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5335	{TGSI_OPCODE_BRK,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5336	{TGSI_OPCODE_IF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5337	/* gap */
5338	{75,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5339	{76,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5340	{TGSI_OPCODE_ELSE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5341	{TGSI_OPCODE_ENDIF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5342	/* gap */
5343	{79,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5344	{80,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5345	{TGSI_OPCODE_PUSHA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5346	{TGSI_OPCODE_POPA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5347	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5348	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5349	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5350	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5351	{TGSI_OPCODE_SHL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2_trans},
5352	/* gap */
5353	{88,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5354	{TGSI_OPCODE_AND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5355	{TGSI_OPCODE_OR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5356	{TGSI_OPCODE_MOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5357	{TGSI_OPCODE_XOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5358	{TGSI_OPCODE_SAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5359	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5360	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5361	{TGSI_OPCODE_CONT,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5362	{TGSI_OPCODE_EMIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5363	{TGSI_OPCODE_ENDPRIM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5364	{TGSI_OPCODE_BGNLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5365	{TGSI_OPCODE_BGNSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5366	{TGSI_OPCODE_ENDLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5367	{TGSI_OPCODE_ENDSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5368	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5369	/* gap */
5370	{104,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5371	{105,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5372	{106,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5373	{TGSI_OPCODE_NOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5374	/* gap */
5375	{108,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5376	{109,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5377	{110,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5378	{111,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5379	{TGSI_OPCODE_NRM4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5380	{TGSI_OPCODE_CALLNZ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5381	{TGSI_OPCODE_IFC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5382	{TGSI_OPCODE_BREAKC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5383	{TGSI_OPCODE_KIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5384	{TGSI_OPCODE_END,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5385	/* gap */
5386	{118,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5387	{TGSI_OPCODE_F2I,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2_trans},
5388	{TGSI_OPCODE_IDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5389	{TGSI_OPCODE_IMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5390	{TGSI_OPCODE_IMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5391	{TGSI_OPCODE_INEG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5392	{TGSI_OPCODE_ISGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5393	{TGSI_OPCODE_ISHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2_trans},
5394	{TGSI_OPCODE_ISLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5395	{TGSI_OPCODE_F2U,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2_trans},
5396	{TGSI_OPCODE_U2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5397	{TGSI_OPCODE_UADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5398	{TGSI_OPCODE_UDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5399	{TGSI_OPCODE_UMAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5400	{TGSI_OPCODE_UMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5401	{TGSI_OPCODE_UMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5402	{TGSI_OPCODE_UMOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5403	{TGSI_OPCODE_UMUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5404	{TGSI_OPCODE_USEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5405	{TGSI_OPCODE_USGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5406	{TGSI_OPCODE_USHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2_trans},
5407	{TGSI_OPCODE_USLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5408	{TGSI_OPCODE_USNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2_swap},
5409	{TGSI_OPCODE_SWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5410	{TGSI_OPCODE_CASE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5411	{TGSI_OPCODE_DEFAULT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5412	{TGSI_OPCODE_ENDSWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5413	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5414	{TGSI_OPCODE_SAMPLE_I,  0, 0, tgsi_unsupported},
5415	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
5416	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5417	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5418	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5419	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5420	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5421	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5422	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5423	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5424	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5425	{TGSI_OPCODE_UARL,      0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_r600_arl},
5426	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5427	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5428	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5429	{TGSI_OPCODE_LAST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5430};
5431
/*
 * TGSI instruction table built from the EG_V_SQ_* hardware opcodes
 * (presumably the Evergreen variant -- confirm at the selection site).
 *
 * Each row lists, in order: the TGSI opcode, a flag that is 1 only for MAD
 * (the one row whose hardware opcode is an OP3 encoding), the hardware
 * opcode, and the handler that translates the instruction.
 *
 * Rows are kept in ascending opcode order and unassigned opcode values get
 * numeric "gap" placeholders, presumably so the table can be indexed
 * directly by TGSI opcode; do not reorder or drop rows without checking
 * the lookup code. Rows handled by tgsi_unsupported are, judging by the
 * handler name, opcodes this backend does not translate.
 *
 * Compared with the r600_ table above, this one uses tgsi_eg_arl for
 * ARL/ARR/UARL, tgsi_f2i for F2I/F2U, and plain tgsi_op2 (instead of
 * tgsi_op2_trans) for SHL/ISHR/USHR.
 */
5432static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
5433	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5434	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5435	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5436	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, tgsi_trans_srcx_replicate},
5437	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, tgsi_rsq},
5438	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5439	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5440	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5441	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5442	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5443	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5444	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5445	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5446	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5447	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5448	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5449	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5450	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5451	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5452	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5453	/* gap */
5454	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5455	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5456	/* gap */
5457	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5458	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5459	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5460	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5461	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5462	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5463	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5464	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5465	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5466	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5467	/* gap */
5468	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5469	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5470	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5471	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5472	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5473	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5474	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5475	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5476	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5477	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5478	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5479	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5480	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5481	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5482	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5483	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5484	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5485	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5486	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5487	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5488	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5489	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5490	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5491	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5492	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5493	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5494	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5495	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5496	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5497	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5498	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5499	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5500	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5501	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5502	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5503	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5504	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5505	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5506	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5507	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5508	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5509	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5510	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5511	/* gap */
5512	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5513	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5514	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5515	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5516	/* gap */
5517	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5518	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5519	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5520	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5521	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5522	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5523	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5524	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5525	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
5526	/* gap */
5527	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5528	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5529	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5530	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5531	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5532	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5533	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5534	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5535	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5536	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5537	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5538	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5539	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5540	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5541	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5542	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5543	/* gap */
5544	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5545	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5546	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5547	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5548	/* gap */
5549	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5550	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5551	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5552	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5553	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5554	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5555	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5556	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5557	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5558	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5559	/* gap */
5560	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5561	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_f2i},
5562	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5563	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5564	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5565	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5566	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5567	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
5568	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5569	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_f2i},
5570	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5571	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5572	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5573	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5574	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5575	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5576	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5577	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5578	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5579	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5580	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
5581	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5582	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
5583	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5584	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5585	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5586	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5587	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5588	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
5589	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
5590	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5591	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5592	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5593	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5594	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5595	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5596	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5597	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5598	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5599	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
5600	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5601	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5602	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5603	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5604};
5605
5606static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
5607	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5608	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5609	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5610	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, cayman_emit_float_instr},
5611	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, cayman_emit_float_instr},
5612	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5613	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5614	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5615	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5616	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5617	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5618	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5619	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5620	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5621	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5622	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5623	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5624	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5625	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5626	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5627	/* gap */
5628	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5629	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5630	/* gap */
5631	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5632	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5633	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5634	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5635	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5636	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5637	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, cayman_emit_float_instr},
5638	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, cayman_emit_float_instr},
5639	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, cayman_pow},
5640	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5641	/* gap */
5642	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5643	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5644	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5645	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5646	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, cayman_trig},
5647	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5648	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5649	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5650	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5651	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5652	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5653	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5654	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5655	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5656	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5657	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5658	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, cayman_trig},
5659	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5660	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5661	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5662	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5663	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5664	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5665	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5666	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5667	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5668	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5669	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5670	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5671	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5672	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5673	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5674	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5675	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5676	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5677	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5678	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5679	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5680	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5681	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5682	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5683	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5684	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5685	/* gap */
5686	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5687	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5688	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5689	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5690	/* gap */
5691	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5692	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5693	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5694	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5695	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5696	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2},
5697	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5698	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5699	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
5700	/* gap */
5701	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5702	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5703	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5704	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5705	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5706	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5707	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5708	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5709	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5710	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5711	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5712	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5713	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5714	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5715	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5716	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5717	/* gap */
5718	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5719	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5720	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5721	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5722	/* gap */
5723	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5724	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5725	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5726	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5727	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5728	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5729	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5730	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5731	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5732	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5733	/* gap */
5734	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5735	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2},
5736	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5737	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5738	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5739	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5740	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5741	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
5742	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5743	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
5744	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2},
5745	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5746	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5747	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5748	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5749	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5750	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5751	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT, cayman_mul_int_instr},
5752	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5753	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5754	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
5755	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5756	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
5757	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5758	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5759	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5760	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5761	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5762	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
5763	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
5764	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5765	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5766	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5767	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5768	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5769	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5770	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5771	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5772	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5773	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
5774	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5775	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5776	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5777	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5778};
5779