r600_shader.c revision 5ab82e0ccf84855e9311ebfc58d1b57b437ed991
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600_shader.h"
28#include "r600d.h"
29
30#include "pipe/p_shader_tokens.h"
31#include "tgsi/tgsi_info.h"
32#include "tgsi/tgsi_parse.h"
33#include "tgsi/tgsi_scan.h"
34#include "tgsi/tgsi_dump.h"
35#include "util/u_memory.h"
36#include <stdio.h>
37#include <errno.h>
38#include <byteswap.h>
39
40/* CAYMAN notes
41Why CAYMAN got loops for lots of instructions is explained here.
42
43-These 8xx t-slot only ops are implemented in all vector slots.
44MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
45These 8xx t-slot only opcodes become vector ops, with all four
46slots expecting the arguments on sources a and b. Result is
47broadcast to all channels.
48MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
49These 8xx t-slot only opcodes become vector ops in the z, y, and
50x slots.
51EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
52RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
53SQRT_IEEE/_64
54SIN/COS
55The w slot may have an independent co-issued operation, or if the
56result is required to be in the w slot, the opcode above may be
57issued in the w slot as well.
58The compiler must issue the source argument to slots z, y, and x
59*/
60
61static int r600_pipe_shader(struct pipe_context *ctx, struct r600_pipe_shader *shader)
62{
63	struct r600_context *rctx = (struct r600_context *)ctx;
64	struct r600_shader *rshader = &shader->shader;
65	uint32_t *ptr;
66	int	i;
67
68	/* copy new shader */
69	if (shader->bo == NULL) {
70		shader->bo = (struct r600_resource*)
71			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, rshader->bc.ndw * 4);
72		if (shader->bo == NULL) {
73			return -ENOMEM;
74		}
75		ptr = (uint32_t*)rctx->ws->buffer_map(shader->bo->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
76		if (R600_BIG_ENDIAN) {
77			for (i = 0; i < rshader->bc.ndw; ++i) {
78				ptr[i] = bswap_32(rshader->bc.bytecode[i]);
79			}
80		} else {
81			memcpy(ptr, rshader->bc.bytecode, rshader->bc.ndw * sizeof(*ptr));
82		}
83		rctx->ws->buffer_unmap(shader->bo->cs_buf);
84	}
85	/* build state */
86	switch (rshader->processor_type) {
87	case TGSI_PROCESSOR_VERTEX:
88		if (rctx->chip_class >= EVERGREEN) {
89			evergreen_pipe_shader_vs(ctx, shader);
90		} else {
91			r600_pipe_shader_vs(ctx, shader);
92		}
93		break;
94	case TGSI_PROCESSOR_FRAGMENT:
95		if (rctx->chip_class >= EVERGREEN) {
96			evergreen_pipe_shader_ps(ctx, shader);
97		} else {
98			r600_pipe_shader_ps(ctx, shader);
99		}
100		break;
101	default:
102		return -EINVAL;
103	}
104	return 0;
105}
106
107static int r600_shader_from_tgsi(struct r600_screen *rscreen,
108				 struct r600_pipe_shader *pipeshader,
109				 struct r600_shader_key key);
110
111int r600_pipe_shader_create(struct pipe_context *ctx,
112			    struct r600_pipe_shader *shader,
113			    struct r600_shader_key key)
114{
115	static int dump_shaders = -1;
116	struct r600_context *rctx = (struct r600_context *)ctx;
117	struct r600_pipe_shader_selector *sel = shader->selector;
118	int r;
119
120	/* Would like some magic "get_bool_option_once" routine.
121	*/
122	if (dump_shaders == -1)
123		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
124
125	if (dump_shaders) {
126		fprintf(stderr, "--------------------------------------------------------------\n");
127		tgsi_dump(sel->tokens, 0);
128
129		if (sel->so.num_outputs) {
130			unsigned i;
131			fprintf(stderr, "STREAMOUT\n");
132			for (i = 0; i < sel->so.num_outputs; i++) {
133				unsigned mask = ((1 << sel->so.output[i].num_components) - 1) <<
134						sel->so.output[i].start_component;
135				fprintf(stderr, "  %i: MEM_STREAM0_BUF%i OUT[%i].%s%s%s%s\n", i,
136					sel->so.output[i].output_buffer, sel->so.output[i].register_index,
137				        mask & 1 ? "x" : "_",
138				        (mask >> 1) & 1 ? "y" : "_",
139				        (mask >> 2) & 1 ? "z" : "_",
140				        (mask >> 3) & 1 ? "w" : "_");
141			}
142		}
143	}
144	r = r600_shader_from_tgsi(rctx->screen, shader, key);
145	if (r) {
146		R600_ERR("translation from TGSI failed !\n");
147		return r;
148	}
149	r = r600_bytecode_build(&shader->shader.bc);
150	if (r) {
151		R600_ERR("building bytecode failed !\n");
152		return r;
153	}
154	if (dump_shaders) {
155		r600_bytecode_dump(&shader->shader.bc);
156		fprintf(stderr, "______________________________________________________________\n");
157	}
158	return r600_pipe_shader(ctx, shader);
159}
160
161void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
162{
163	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
164	r600_bytecode_clear(&shader->shader.bc);
165}
166
167/*
168 * tgsi -> r600 shader
169 */
170struct r600_shader_tgsi_instruction;
171
/* One translated source operand, as filled in by tgsi_src(). */
struct r600_shader_src {
	unsigned	sel;		/* register / constant selector */
	unsigned	swizzle[4];	/* source channel for x, y, z, w */
	unsigned	neg;		/* negate modifier */
	unsigned	abs;		/* absolute-value modifier */
	unsigned	rel;		/* relative addressing mode */
	uint32_t	value[4];	/* literal dwords for immediate operands */
};
180
181struct r600_shader_ctx {
182	struct tgsi_shader_info			info;
183	struct tgsi_parse_context		parse;
184	const struct tgsi_token			*tokens;
185	unsigned				type;
186	unsigned				file_offset[TGSI_FILE_COUNT];
187	unsigned				temp_reg;
188	struct r600_shader_tgsi_instruction	*inst_info;
189	struct r600_bytecode			*bc;
190	struct r600_shader			*shader;
191	struct r600_shader_src			src[4];
192	uint32_t				*literals;
193	uint32_t				nliterals;
194	uint32_t				max_driver_temp_used;
195	boolean use_llvm;
196	/* needed for evergreen interpolation */
197	boolean                                 input_centroid;
198	boolean                                 input_linear;
199	boolean                                 input_perspective;
200	int					num_interp_gpr;
201	int					face_gpr;
202	int					colors_used;
203	boolean                 clip_vertex_write;
204	unsigned                cv_output;
205	int					fragcoord_input;
206	int					native_integers;
207};
208
/* One row of a per-chip instruction table, indexed by TGSI opcode. */
struct r600_shader_tgsi_instruction {
	unsigned	tgsi_opcode;
	unsigned	is_op3;
	unsigned	r600_opcode;
	/* translation callback for this opcode */
	int		(*process)(struct r600_shader_ctx *ctx);
};
215
216static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
217static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
218static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
219static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
220static int tgsi_else(struct r600_shader_ctx *ctx);
221static int tgsi_endif(struct r600_shader_ctx *ctx);
222static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
223static int tgsi_endloop(struct r600_shader_ctx *ctx);
224static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
225
226/*
227 * bytestream -> r600 shader
228 *
229 * These functions are used to transform the output of the LLVM backend into
230 * struct r600_bytecode.
231 */
232
233static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
234				unsigned char * bytes,	unsigned num_bytes);
235
236#ifdef HAVE_OPENCL
237int r600_compute_shader_create(struct pipe_context * ctx,
238	LLVMModuleRef mod,  struct r600_bytecode * bytecode)
239{
240	struct r600_context *r600_ctx = (struct r600_context *)ctx;
241	unsigned char * bytes;
242	unsigned byte_count;
243	struct r600_shader_ctx shader_ctx;
244	unsigned dump = 0;
245
246	if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
247		dump = 1;
248	}
249
250	r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
251	shader_ctx.bc = bytecode;
252	r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family);
253	shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
254	r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
255	if (shader_ctx.bc->chip_class == CAYMAN) {
256		cm_bytecode_add_cf_end(shader_ctx.bc);
257	}
258	r600_bytecode_build(shader_ctx.bc);
259	if (dump) {
260		r600_bytecode_dump(shader_ctx.bc);
261	}
262	free(bytes);
263	return 1;
264}
265
266#endif /* HAVE_OPENCL */
267
/**
 * Read a little-endian 32-bit value from @bytes starting at *@bytes_read and
 * advance *@bytes_read by four.
 */
static uint32_t i32_from_byte_stream(unsigned char * bytes,
		unsigned * bytes_read)
{
	unsigned i;
	uint32_t out = 0;
	for (i = 0; i < 4; i++) {
		/* Widen before shifting: an unsigned char promotes to int, and
		 * shifting a byte >= 0x80 left by 24 overflows a signed int. */
		out |= (uint32_t)bytes[(*bytes_read)++] << (8 * i);
	}
	return out;
}
278
279static unsigned r600_src_from_byte_stream(unsigned char * bytes,
280		unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
281{
282	unsigned i;
283	unsigned sel0, sel1;
284	sel0 = bytes[bytes_read++];
285	sel1 = bytes[bytes_read++];
286	alu->src[src_idx].sel = sel0 | (sel1 << 8);
287	alu->src[src_idx].chan = bytes[bytes_read++];
288	alu->src[src_idx].neg = bytes[bytes_read++];
289	alu->src[src_idx].abs = bytes[bytes_read++];
290	alu->src[src_idx].rel = bytes[bytes_read++];
291	alu->src[src_idx].kc_bank = bytes[bytes_read++];
292	for (i = 0; i < 4; i++) {
293		alu->src[src_idx].value |= bytes[bytes_read++] << (i * 8);
294	}
295	return bytes_read;
296}
297
298static unsigned r600_alu_from_byte_stream(struct r600_shader_ctx *ctx,
299				unsigned char * bytes, unsigned bytes_read)
300{
301	unsigned src_idx;
302	struct r600_bytecode_alu alu;
303	unsigned src_const_reg[3];
304	uint32_t word0, word1;
305
306	memset(&alu, 0, sizeof(alu));
307	for(src_idx = 0; src_idx < 3; src_idx++) {
308		unsigned i;
309		src_const_reg[src_idx] = bytes[bytes_read++];
310		for (i = 0; i < 4; i++) {
311			alu.src[src_idx].value |= bytes[bytes_read++] << (i * 8);
312		}
313	}
314
315	word0 = i32_from_byte_stream(bytes, &bytes_read);
316	word1 = i32_from_byte_stream(bytes, &bytes_read);
317
318	switch(ctx->bc->chip_class) {
319	case R600:
320		r600_bytecode_alu_read(&alu, word0, word1);
321		break;
322	case R700:
323	case EVERGREEN:
324	case CAYMAN:
325		r700_bytecode_alu_read(&alu, word0, word1);
326		break;
327	}
328
329	for(src_idx = 0; src_idx < 3; src_idx++) {
330		if (src_const_reg[src_idx])
331			alu.src[src_idx].sel += 512;
332	}
333
334#if HAVE_LLVM < 0x0302
335	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE) ||
336	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE) ||
337	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT) ||
338	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT)) {
339		alu.update_pred = 1;
340		alu.dst.write = 0;
341		alu.src[1].sel = V_SQ_ALU_SRC_0;
342		alu.src[1].chan = 0;
343		alu.last = 1;
344	}
345#endif
346
347	if (alu.execute_mask) {
348		alu.pred_sel = 0;
349		r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
350	} else {
351		r600_bytecode_add_alu(ctx->bc, &alu);
352	}
353
354	/* XXX: Handle other KILL instructions */
355	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT)) {
356		ctx->shader->uses_kill = 1;
357		/* XXX: This should be enforced in the LLVM backend. */
358		ctx->bc->force_add_cf = 1;
359	}
360	return bytes_read;
361}
362
363static void llvm_if(struct r600_shader_ctx *ctx, struct r600_bytecode_alu * alu,
364	unsigned pred_inst)
365{
366	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
367	fc_pushlevel(ctx, FC_IF);
368	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
369}
370
371static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx,
372			struct r600_bytecode_alu *alu, unsigned compare_opcode)
373{
374	unsigned opcode = TGSI_OPCODE_BRK;
375	if (ctx->bc->chip_class == CAYMAN)
376		ctx->inst_info = &cm_shader_tgsi_instruction[opcode];
377	else if (ctx->bc->chip_class >= EVERGREEN)
378		ctx->inst_info = &eg_shader_tgsi_instruction[opcode];
379	else
380		ctx->inst_info = &r600_shader_tgsi_instruction[opcode];
381	llvm_if(ctx, alu, compare_opcode);
382	tgsi_loop_brk_cont(ctx);
383	tgsi_endif(ctx);
384}
385
386static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
387				unsigned char * bytes, unsigned bytes_read)
388{
389	struct r600_bytecode_alu alu;
390	unsigned inst;
391	memset(&alu, 0, sizeof(alu));
392	bytes_read = r600_src_from_byte_stream(bytes, bytes_read, &alu, 0);
393	inst = bytes[bytes_read++];
394	switch (inst) {
395	case 0: /* FC_IF */
396		llvm_if(ctx, &alu,
397			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
398		break;
399	case 1: /* FC_IF_INT */
400		llvm_if(ctx, &alu,
401			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
402		break;
403	case 2: /* FC_ELSE */
404		tgsi_else(ctx);
405		break;
406	case 3: /* FC_ENDIF */
407		tgsi_endif(ctx);
408		break;
409	case 4: /* FC_BGNLOOP */
410		tgsi_bgnloop(ctx);
411		break;
412	case 5: /* FC_ENDLOOP */
413		tgsi_endloop(ctx);
414		break;
415	case 6: /* FC_BREAK */
416		r600_break_from_byte_stream(ctx, &alu,
417			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
418		break;
419	case 7: /* FC_BREAK_NZ_INT */
420		r600_break_from_byte_stream(ctx, &alu,
421			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
422		break;
423	case 8: /* FC_CONTINUE */
424		{
425			unsigned opcode = TGSI_OPCODE_CONT;
426			if (ctx->bc->chip_class == CAYMAN) {
427				ctx->inst_info =
428					&cm_shader_tgsi_instruction[opcode];
429			} else if (ctx->bc->chip_class >= EVERGREEN) {
430				ctx->inst_info =
431					&eg_shader_tgsi_instruction[opcode];
432			} else {
433				ctx->inst_info =
434					&r600_shader_tgsi_instruction[opcode];
435			}
436			tgsi_loop_brk_cont(ctx);
437		}
438		break;
439	case 9: /* FC_BREAK_Z_INT */
440		r600_break_from_byte_stream(ctx, &alu,
441			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
442		break;
443	case 10: /* FC_BREAK_NZ */
444		r600_break_from_byte_stream(ctx, &alu,
445			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
446		break;
447	}
448
449	return bytes_read;
450}
451
452static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
453				unsigned char * bytes, unsigned bytes_read)
454{
455	struct r600_bytecode_tex tex;
456
457	tex.inst = bytes[bytes_read++];
458	tex.resource_id = bytes[bytes_read++];
459	tex.src_gpr = bytes[bytes_read++];
460	tex.src_rel = bytes[bytes_read++];
461	tex.dst_gpr = bytes[bytes_read++];
462	tex.dst_rel = bytes[bytes_read++];
463	tex.dst_sel_x = bytes[bytes_read++];
464	tex.dst_sel_y = bytes[bytes_read++];
465	tex.dst_sel_z = bytes[bytes_read++];
466	tex.dst_sel_w = bytes[bytes_read++];
467	tex.lod_bias = bytes[bytes_read++];
468	tex.coord_type_x = bytes[bytes_read++];
469	tex.coord_type_y = bytes[bytes_read++];
470	tex.coord_type_z = bytes[bytes_read++];
471	tex.coord_type_w = bytes[bytes_read++];
472	tex.offset_x = bytes[bytes_read++];
473	tex.offset_y = bytes[bytes_read++];
474	tex.offset_z = bytes[bytes_read++];
475	tex.sampler_id = bytes[bytes_read++];
476	tex.src_sel_x = bytes[bytes_read++];
477	tex.src_sel_y = bytes[bytes_read++];
478	tex.src_sel_z = bytes[bytes_read++];
479	tex.src_sel_w = bytes[bytes_read++];
480
481	r600_bytecode_add_tex(ctx->bc, &tex);
482
483	return bytes_read;
484}
485
486static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
487	unsigned char * bytes, unsigned bytes_read)
488{
489	struct r600_bytecode_vtx vtx;
490
491	uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
492        uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
493	uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);
494
495	memset(&vtx, 0, sizeof(vtx));
496
497	/* WORD0 */
498	vtx.inst = G_SQ_VTX_WORD0_VTX_INST(word0);
499	vtx.fetch_type = G_SQ_VTX_WORD0_FETCH_TYPE(word0);
500	vtx.buffer_id = G_SQ_VTX_WORD0_BUFFER_ID(word0);
501	vtx.src_gpr = G_SQ_VTX_WORD0_SRC_GPR(word0);
502	vtx.src_sel_x = G_SQ_VTX_WORD0_SRC_SEL_X(word0);
503	vtx.mega_fetch_count = G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(word0);
504
505	/* WORD1 */
506	vtx.dst_gpr = G_SQ_VTX_WORD1_GPR_DST_GPR(word1);
507	vtx.dst_sel_x = G_SQ_VTX_WORD1_DST_SEL_X(word1);
508	vtx.dst_sel_y = G_SQ_VTX_WORD1_DST_SEL_Y(word1);
509	vtx.dst_sel_z = G_SQ_VTX_WORD1_DST_SEL_Z(word1);
510	vtx.dst_sel_w = G_SQ_VTX_WORD1_DST_SEL_W(word1);
511	vtx.use_const_fields = G_SQ_VTX_WORD1_USE_CONST_FIELDS(word1);
512	vtx.data_format = G_SQ_VTX_WORD1_DATA_FORMAT(word1);
513	vtx.num_format_all = G_SQ_VTX_WORD1_NUM_FORMAT_ALL(word1);
514	vtx.format_comp_all = G_SQ_VTX_WORD1_FORMAT_COMP_ALL(word1);
515	vtx.srf_mode_all = G_SQ_VTX_WORD1_SRF_MODE_ALL(word1);
516
517	/* WORD 2*/
518	vtx.offset = G_SQ_VTX_WORD2_OFFSET(word2);
519	vtx.endian = G_SQ_VTX_WORD2_ENDIAN_SWAP(word2);
520
521	if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
522		fprintf(stderr, "Error adding vtx\n");
523	}
524	/* Use the Texture Cache */
525	ctx->bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX;
526	return bytes_read;
527}
528
529static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
530				unsigned char * bytes,	unsigned num_bytes)
531{
532	unsigned bytes_read = 0;
533	unsigned i, byte;
534	while (bytes_read < num_bytes) {
535		char inst_type = bytes[bytes_read++];
536		switch (inst_type) {
537		case 0:
538			bytes_read = r600_alu_from_byte_stream(ctx, bytes,
539								bytes_read);
540			break;
541		case 1:
542			bytes_read = r600_tex_from_byte_stream(ctx, bytes,
543								bytes_read);
544			break;
545		case 2:
546			bytes_read = r600_fc_from_byte_stream(ctx, bytes,
547								bytes_read);
548			break;
549		case 3:
550			r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
551			for (i = 0; i < 2; i++) {
552				for (byte = 0 ; byte < 4; byte++) {
553					ctx->bc->cf_last->isa[i] |=
554					(bytes[bytes_read++] << (byte * 8));
555				}
556			}
557			break;
558
559		case 4:
560			bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
561								bytes_read);
562			break;
563		default:
564			/* XXX: Error here */
565			break;
566		}
567	}
568}
569
570/* End bytestream -> r600 shader functions*/
571
572static int tgsi_is_supported(struct r600_shader_ctx *ctx)
573{
574	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
575	int j;
576
577	if (i->Instruction.NumDstRegs > 1) {
578		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
579		return -EINVAL;
580	}
581	if (i->Instruction.Predicate) {
582		R600_ERR("predicate unsupported\n");
583		return -EINVAL;
584	}
585#if 0
586	if (i->Instruction.Label) {
587		R600_ERR("label unsupported\n");
588		return -EINVAL;
589	}
590#endif
591	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
592		if (i->Src[j].Register.Dimension) {
593			R600_ERR("unsupported src %d (dimension %d)\n", j,
594				 i->Src[j].Register.Dimension);
595			return -EINVAL;
596		}
597	}
598	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
599		if (i->Dst[j].Register.Dimension) {
600			R600_ERR("unsupported dst (dimension)\n");
601			return -EINVAL;
602		}
603	}
604	return 0;
605}
606
607static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
608{
609	int i, r;
610	struct r600_bytecode_alu alu;
611	int gpr = 0, base_chan = 0;
612	int ij_index = 0;
613
614	if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
615		ij_index = 0;
616		if (ctx->shader->input[input].centroid)
617			ij_index++;
618	} else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
619		ij_index = 0;
620		/* if we have perspective add one */
621		if (ctx->input_perspective)  {
622			ij_index++;
623			/* if we have perspective centroid */
624			if (ctx->input_centroid)
625				ij_index++;
626		}
627		if (ctx->shader->input[input].centroid)
628			ij_index++;
629	}
630
631	/* work out gpr and base_chan from index */
632	gpr = ij_index / 2;
633	base_chan = (2 * (ij_index % 2)) + 1;
634
635	for (i = 0; i < 8; i++) {
636		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
637
638		if (i < 4)
639			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_ZW;
640		else
641			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_XY;
642
643		if ((i > 1) && (i < 6)) {
644			alu.dst.sel = ctx->shader->input[input].gpr;
645			alu.dst.write = 1;
646		}
647
648		alu.dst.chan = i % 4;
649
650		alu.src[0].sel = gpr;
651		alu.src[0].chan = (base_chan - (i % 2));
652
653		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
654
655		alu.bank_swizzle_force = SQ_ALU_VEC_210;
656		if ((i % 4) == 3)
657			alu.last = 1;
658		r = r600_bytecode_add_alu(ctx->bc, &alu);
659		if (r)
660			return r;
661	}
662	return 0;
663}
664
665static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
666{
667	int i, r;
668	struct r600_bytecode_alu alu;
669
670	for (i = 0; i < 4; i++) {
671		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
672
673		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P0;
674
675		alu.dst.sel = ctx->shader->input[input].gpr;
676		alu.dst.write = 1;
677
678		alu.dst.chan = i;
679
680		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
681		alu.src[0].chan = i;
682
683		if (i == 3)
684			alu.last = 1;
685		r = r600_bytecode_add_alu(ctx->bc, &alu);
686		if (r)
687			return r;
688	}
689	return 0;
690}
691
692/*
693 * Special export handling in shaders
694 *
695 * shader export ARRAY_BASE for EXPORT_POS:
696 * 60 is position
697 * 61 is misc vector
698 * 62, 63 are clip distance vectors
699 *
700 * The use of the values exported in 61-63 is controlled by PA_CL_VS_OUT_CNTL:
701 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
702 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
703 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
704 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
705 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
706 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
707 * exclusive from render target index)
708 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
709 *
710 *
711 * shader export ARRAY_BASE for EXPORT_PIXEL:
712 * 0-7 CB targets
713 * 61 computed Z vector
714 *
715 * The use of the values exported in the computed Z vector is controlled
716 * by DB_SHADER_CONTROL:
717 * Z_EXPORT_ENABLE - Z as a float in RED
718 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
719 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
720 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
721 * DB_SOURCE_FORMAT - export control restrictions
722 *
723 */
724
725
726/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
727static int r600_spi_sid(struct r600_shader_io * io)
728{
729	int index, name = io->name;
730
731	/* These params are handled differently, they don't need
732	 * semantic indices, so we'll use 0 for them.
733	 */
734	if (name == TGSI_SEMANTIC_POSITION ||
735		name == TGSI_SEMANTIC_PSIZE ||
736		name == TGSI_SEMANTIC_FACE)
737		index = 0;
738	else {
739		if (name == TGSI_SEMANTIC_GENERIC) {
740			/* For generic params simply use sid from tgsi */
741			index = io->sid;
742		} else {
743			/* For non-generic params - pack name and sid into 8 bits */
744			index = 0x80 | (name<<3) | (io->sid);
745		}
746
747		/* Make sure that all really used indices have nonzero value, so
748		 * we can just compare it to 0 later instead of comparing the name
749		 * with different values to detect special cases. */
750		index++;
751	}
752
753	return index;
754};
755
756/* turn input into interpolate on EG */
757static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
758{
759	int r = 0;
760
761	if (ctx->shader->input[index].spi_sid) {
762		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
763		if (!ctx->use_llvm) {
764			if (ctx->shader->input[index].interpolate > 0) {
765				r = evergreen_interp_alu(ctx, index);
766			} else {
767				r = evergreen_interp_flat(ctx, index);
768			}
769		}
770	}
771	return r;
772}
773
774static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
775{
776	struct r600_bytecode_alu alu;
777	int i, r;
778	int gpr_front = ctx->shader->input[front].gpr;
779	int gpr_back = ctx->shader->input[back].gpr;
780
781	for (i = 0; i < 4; i++) {
782		memset(&alu, 0, sizeof(alu));
783		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
784		alu.is_op3 = 1;
785		alu.dst.write = 1;
786		alu.dst.sel = gpr_front;
787		alu.src[0].sel = ctx->face_gpr;
788		alu.src[1].sel = gpr_front;
789		alu.src[2].sel = gpr_back;
790
791		alu.dst.chan = i;
792		alu.src[1].chan = i;
793		alu.src[2].chan = i;
794		alu.last = (i==3);
795
796		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
797			return r;
798	}
799
800	return 0;
801}
802
803static int tgsi_declaration(struct r600_shader_ctx *ctx)
804{
805	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
806	unsigned i;
807	int r;
808
809	switch (d->Declaration.File) {
810	case TGSI_FILE_INPUT:
811		i = ctx->shader->ninput++;
812		ctx->shader->input[i].name = d->Semantic.Name;
813		ctx->shader->input[i].sid = d->Semantic.Index;
814		ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
815		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
816		ctx->shader->input[i].centroid = d->Interp.Centroid;
817		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
818		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
819			switch (ctx->shader->input[i].name) {
820			case TGSI_SEMANTIC_FACE:
821				ctx->face_gpr = ctx->shader->input[i].gpr;
822				break;
823			case TGSI_SEMANTIC_COLOR:
824				ctx->colors_used++;
825				break;
826			case TGSI_SEMANTIC_POSITION:
827				ctx->fragcoord_input = i;
828				break;
829			}
830			if (ctx->bc->chip_class >= EVERGREEN) {
831				if ((r = evergreen_interp_input(ctx, i)))
832					return r;
833			}
834		}
835		break;
836	case TGSI_FILE_OUTPUT:
837		i = ctx->shader->noutput++;
838		ctx->shader->output[i].name = d->Semantic.Name;
839		ctx->shader->output[i].sid = d->Semantic.Index;
840		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
841		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
842		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
843		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
844		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
845			switch (d->Semantic.Name) {
846			case TGSI_SEMANTIC_CLIPDIST:
847				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
848				break;
849			case TGSI_SEMANTIC_PSIZE:
850				ctx->shader->vs_out_misc_write = 1;
851				ctx->shader->vs_out_point_size = 1;
852				break;
853			case TGSI_SEMANTIC_CLIPVERTEX:
854				ctx->clip_vertex_write = TRUE;
855				ctx->cv_output = i;
856				break;
857			}
858		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
859			switch (d->Semantic.Name) {
860			case TGSI_SEMANTIC_COLOR:
861				ctx->shader->nr_ps_max_color_exports++;
862				break;
863			}
864		}
865		break;
866	case TGSI_FILE_CONSTANT:
867	case TGSI_FILE_TEMPORARY:
868	case TGSI_FILE_SAMPLER:
869	case TGSI_FILE_ADDRESS:
870		break;
871
872	case TGSI_FILE_SYSTEM_VALUE:
873		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
874			if (!ctx->native_integers) {
875				struct r600_bytecode_alu alu;
876				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
877
878				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
879				alu.src[0].sel = 0;
880				alu.src[0].chan = 3;
881
882				alu.dst.sel = 0;
883				alu.dst.chan = 3;
884				alu.dst.write = 1;
885				alu.last = 1;
886
887				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
888					return r;
889			}
890			break;
891		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
892			break;
893	default:
894		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
895		return -EINVAL;
896	}
897	return 0;
898}
899
900static int r600_get_temp(struct r600_shader_ctx *ctx)
901{
902	return ctx->temp_reg + ctx->max_driver_temp_used++;
903}
904
905/*
906 * for evergreen we need to scan the shader to find the number of GPRs we need to
907 * reserve for interpolation.
908 *
909 * we need to know if we are going to emit
910 * any centroid inputs
911 * if perspective and linear are required
912*/
913static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
914{
915	int i;
916	int num_baryc;
917
918	ctx->input_linear = FALSE;
919	ctx->input_perspective = FALSE;
920	ctx->input_centroid = FALSE;
921	ctx->num_interp_gpr = 1;
922
923	/* any centroid inputs */
924	for (i = 0; i < ctx->info.num_inputs; i++) {
925		/* skip position/face */
926		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
927		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
928			continue;
929		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
930			ctx->input_linear = TRUE;
931		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
932			ctx->input_perspective = TRUE;
933		if (ctx->info.input_centroid[i])
934			ctx->input_centroid = TRUE;
935	}
936
937	num_baryc = 0;
938	/* ignoring sample for now */
939	if (ctx->input_perspective)
940		num_baryc++;
941	if (ctx->input_linear)
942		num_baryc++;
943	if (ctx->input_centroid)
944		num_baryc *= 2;
945
946	ctx->num_interp_gpr += (num_baryc + 1) >> 1;
947
948	/* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */
949	return ctx->num_interp_gpr;
950}
951
952static void tgsi_src(struct r600_shader_ctx *ctx,
953		     const struct tgsi_full_src_register *tgsi_src,
954		     struct r600_shader_src *r600_src)
955{
956	memset(r600_src, 0, sizeof(*r600_src));
957	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
958	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
959	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
960	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
961	r600_src->neg = tgsi_src->Register.Negate;
962	r600_src->abs = tgsi_src->Register.Absolute;
963
964	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
965		int index;
966		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
967			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
968			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
969
970			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
971			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
972			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
973				return;
974		}
975		index = tgsi_src->Register.Index;
976		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
977		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
978	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
979		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
980			r600_src->swizzle[0] = 3;
981			r600_src->swizzle[1] = 3;
982			r600_src->swizzle[2] = 3;
983			r600_src->swizzle[3] = 3;
984			r600_src->sel = 0;
985		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
986			r600_src->swizzle[0] = 0;
987			r600_src->swizzle[1] = 0;
988			r600_src->swizzle[2] = 0;
989			r600_src->swizzle[3] = 0;
990			r600_src->sel = 0;
991		}
992	} else {
993		if (tgsi_src->Register.Indirect)
994			r600_src->rel = V_SQ_REL_RELATIVE;
995		r600_src->sel = tgsi_src->Register.Index;
996		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
997	}
998}
999
1000static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int offset, unsigned int dst_reg)
1001{
1002	struct r600_bytecode_vtx vtx;
1003	unsigned int ar_reg;
1004	int r;
1005
1006	if (offset) {
1007		struct r600_bytecode_alu alu;
1008
1009		memset(&alu, 0, sizeof(alu));
1010
1011		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
1012		alu.src[0].sel = ctx->bc->ar_reg;
1013
1014		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1015		alu.src[1].value = offset;
1016
1017		alu.dst.sel = dst_reg;
1018		alu.dst.write = 1;
1019		alu.last = 1;
1020
1021		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1022			return r;
1023
1024		ar_reg = dst_reg;
1025	} else {
1026		ar_reg = ctx->bc->ar_reg;
1027	}
1028
1029	memset(&vtx, 0, sizeof(vtx));
1030	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
1031	vtx.src_gpr = ar_reg;
1032	vtx.mega_fetch_count = 16;
1033	vtx.dst_gpr = dst_reg;
1034	vtx.dst_sel_x = 0;		/* SEL_X */
1035	vtx.dst_sel_y = 1;		/* SEL_Y */
1036	vtx.dst_sel_z = 2;		/* SEL_Z */
1037	vtx.dst_sel_w = 3;		/* SEL_W */
1038	vtx.data_format = FMT_32_32_32_32_FLOAT;
1039	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1040	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1041	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
1042	vtx.endian = r600_endian_swap(32);
1043
1044	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1045		return r;
1046
1047	return 0;
1048}
1049
1050static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1051{
1052	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1053	struct r600_bytecode_alu alu;
1054	int i, j, k, nconst, r;
1055
1056	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1057		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1058			nconst++;
1059		}
1060		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1061	}
1062	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1063		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1064			continue;
1065		}
1066
1067		if (ctx->src[i].rel) {
1068			int treg = r600_get_temp(ctx);
1069			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].sel - 512, treg)))
1070				return r;
1071
1072			ctx->src[i].sel = treg;
1073			ctx->src[i].rel = 0;
1074			j--;
1075		} else if (j > 0) {
1076			int treg = r600_get_temp(ctx);
1077			for (k = 0; k < 4; k++) {
1078				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1079				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1080				alu.src[0].sel = ctx->src[i].sel;
1081				alu.src[0].chan = k;
1082				alu.src[0].rel = ctx->src[i].rel;
1083				alu.dst.sel = treg;
1084				alu.dst.chan = k;
1085				alu.dst.write = 1;
1086				if (k == 3)
1087					alu.last = 1;
1088				r = r600_bytecode_add_alu(ctx->bc, &alu);
1089				if (r)
1090					return r;
1091			}
1092			ctx->src[i].sel = treg;
1093			ctx->src[i].rel =0;
1094			j--;
1095		}
1096	}
1097	return 0;
1098}
1099
1100/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1101static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1102{
1103	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1104	struct r600_bytecode_alu alu;
1105	int i, j, k, nliteral, r;
1106
1107	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1108		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1109			nliteral++;
1110		}
1111	}
1112	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1113		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1114			int treg = r600_get_temp(ctx);
1115			for (k = 0; k < 4; k++) {
1116				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1117				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1118				alu.src[0].sel = ctx->src[i].sel;
1119				alu.src[0].chan = k;
1120				alu.src[0].value = ctx->src[i].value[k];
1121				alu.dst.sel = treg;
1122				alu.dst.chan = k;
1123				alu.dst.write = 1;
1124				if (k == 3)
1125					alu.last = 1;
1126				r = r600_bytecode_add_alu(ctx->bc, &alu);
1127				if (r)
1128					return r;
1129			}
1130			ctx->src[i].sel = treg;
1131			j--;
1132		}
1133	}
1134	return 0;
1135}
1136
1137static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1138{
1139	int i, r, count = ctx->shader->ninput;
1140
1141	for (i = 0; i < count; i++) {
1142		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1143			unsigned back_facing_reg = ctx->shader->input[i].potential_back_facing_reg;
1144			if (ctx->bc->chip_class >= EVERGREEN) {
1145				if ((r = evergreen_interp_input(ctx, back_facing_reg)))
1146					return r;
1147			}
1148
1149			if (!ctx->use_llvm) {
1150				r = select_twoside_color(ctx, i, back_facing_reg);
1151				if (r)
1152					return r;
1153			}
1154		}
1155	}
1156	return 0;
1157}
1158
1159static int r600_shader_from_tgsi(struct r600_screen *rscreen,
1160				 struct r600_pipe_shader *pipeshader,
1161				 struct r600_shader_key key)
1162{
1163	struct r600_shader *shader = &pipeshader->shader;
1164	struct tgsi_token *tokens = pipeshader->selector->tokens;
1165	struct pipe_stream_output_info so = pipeshader->selector->so;
1166	struct tgsi_full_immediate *immediate;
1167	struct tgsi_full_property *property;
1168	struct r600_shader_ctx ctx;
1169	struct r600_bytecode_output output[32];
1170	unsigned output_done, noutput;
1171	unsigned opcode;
1172	int i, j, k, r = 0;
1173	int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0;
1174	/* Declarations used by llvm code */
1175	bool use_llvm = false;
1176	unsigned char * inst_bytes = NULL;
1177	unsigned inst_byte_count = 0;
1178
1179#ifdef R600_USE_LLVM
1180	use_llvm = debug_get_bool_option("R600_LLVM", TRUE);
1181#endif
1182	ctx.bc = &shader->bc;
1183	ctx.shader = shader;
1184	ctx.native_integers = true;
1185
1186	r600_bytecode_init(ctx.bc, rscreen->chip_class, rscreen->family,
1187			   rscreen->msaa_texture_support);
1188	ctx.tokens = tokens;
1189	tgsi_scan_shader(tokens, &ctx.info);
1190	tgsi_parse_init(&ctx.parse, tokens);
1191	ctx.type = ctx.parse.FullHeader.Processor.Processor;
1192	shader->processor_type = ctx.type;
1193	ctx.bc->type = shader->processor_type;
1194
1195	ctx.face_gpr = -1;
1196	ctx.fragcoord_input = -1;
1197	ctx.colors_used = 0;
1198	ctx.clip_vertex_write = 0;
1199
1200	shader->nr_ps_color_exports = 0;
1201	shader->nr_ps_max_color_exports = 0;
1202
1203	shader->two_side = key.color_two_side;
1204
1205	/* register allocations */
1206	/* Values [0,127] correspond to GPR[0..127].
1207	 * Values [128,159] correspond to constant buffer bank 0
1208	 * Values [160,191] correspond to constant buffer bank 1
1209	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1210	 * Values [256,287] correspond to constant buffer bank 2 (EG)
1211	 * Values [288,319] correspond to constant buffer bank 3 (EG)
1212	 * Other special values are shown in the list below.
1213	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1214	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1215	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1216	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1217	 * 248	SQ_ALU_SRC_0: special constant 0.0.
1218	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
1219	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
1220	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1221	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
1222	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
1223	 * 254	SQ_ALU_SRC_PV: previous vector result.
1224	 * 255	SQ_ALU_SRC_PS: previous scalar result.
1225	 */
1226	for (i = 0; i < TGSI_FILE_COUNT; i++) {
1227		ctx.file_offset[i] = 0;
1228	}
1229	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1230		ctx.file_offset[TGSI_FILE_INPUT] = 1;
1231		if (ctx.bc->chip_class >= EVERGREEN) {
1232			r600_bytecode_add_cfinst(ctx.bc, EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1233		} else {
1234			r600_bytecode_add_cfinst(ctx.bc, V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1235		}
1236	}
1237	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
1238		ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1239	}
1240
1241#ifdef R600_USE_LLVM
1242	if (use_llvm && ctx.info.indirect_files) {
1243		fprintf(stderr, "Warning: R600 LLVM backend does not support "
1244				"indirect adressing.  Falling back to TGSI "
1245				"backend.\n");
1246		use_llvm = 0;
1247	}
1248#endif
1249	ctx.use_llvm = use_llvm;
1250
1251	if (use_llvm) {
1252		ctx.file_offset[TGSI_FILE_OUTPUT] =
1253			ctx.file_offset[TGSI_FILE_INPUT];
1254	} else {
1255	   ctx.file_offset[TGSI_FILE_OUTPUT] =
1256			ctx.file_offset[TGSI_FILE_INPUT] +
1257			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1258	}
1259	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
1260						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
1261
1262	/* Outside the GPR range. This will be translated to one of the
1263	 * kcache banks later. */
1264	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
1265
1266	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
1267	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
1268			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
1269	ctx.temp_reg = ctx.bc->ar_reg + 1;
1270
1271	ctx.nliterals = 0;
1272	ctx.literals = NULL;
1273	shader->fs_write_all = FALSE;
1274	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1275		tgsi_parse_token(&ctx.parse);
1276		switch (ctx.parse.FullToken.Token.Type) {
1277		case TGSI_TOKEN_TYPE_IMMEDIATE:
1278			immediate = &ctx.parse.FullToken.FullImmediate;
1279			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
1280			if(ctx.literals == NULL) {
1281				r = -ENOMEM;
1282				goto out_err;
1283			}
1284			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
1285			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
1286			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
1287			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
1288			ctx.nliterals++;
1289			break;
1290		case TGSI_TOKEN_TYPE_DECLARATION:
1291			r = tgsi_declaration(&ctx);
1292			if (r)
1293				goto out_err;
1294			break;
1295		case TGSI_TOKEN_TYPE_INSTRUCTION:
1296			break;
1297		case TGSI_TOKEN_TYPE_PROPERTY:
1298			property = &ctx.parse.FullToken.FullProperty;
1299			switch (property->Property.PropertyName) {
1300			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
1301				if (property->u[0].Data == 1)
1302					shader->fs_write_all = TRUE;
1303				break;
1304			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
1305				/* we don't need this one */
1306				break;
1307			}
1308			break;
1309		default:
1310			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
1311			r = -EINVAL;
1312			goto out_err;
1313		}
1314	}
1315
1316	/* Process two side if needed */
1317	if (shader->two_side && ctx.colors_used) {
1318		int i, count = ctx.shader->ninput;
1319		unsigned next_lds_loc = ctx.shader->nlds;
1320
1321		/* additional inputs will be allocated right after the existing inputs,
1322		 * we won't need them after the color selection, so we don't need to
1323		 * reserve these gprs for the rest of the shader code and to adjust
1324		 * output offsets etc. */
1325		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
1326				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1327
1328		if (ctx.face_gpr == -1) {
1329			i = ctx.shader->ninput++;
1330			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
1331			ctx.shader->input[i].spi_sid = 0;
1332			ctx.shader->input[i].gpr = gpr++;
1333			ctx.face_gpr = ctx.shader->input[i].gpr;
1334		}
1335
1336		for (i = 0; i < count; i++) {
1337			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1338				int ni = ctx.shader->ninput++;
1339				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
1340				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
1341				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
1342				ctx.shader->input[ni].gpr = gpr++;
1343				// TGSI to LLVM needs to know the lds position of inputs.
1344				// Non LLVM path computes it later (in process_twoside_color)
1345				ctx.shader->input[ni].lds_pos = next_lds_loc++;
1346				ctx.shader->input[i].potential_back_facing_reg = ni;
1347			}
1348		}
1349	}
1350
1351/* LLVM backend setup */
1352#ifdef R600_USE_LLVM
1353	if (use_llvm) {
1354		struct radeon_llvm_context radeon_llvm_ctx;
1355		LLVMModuleRef mod;
1356		unsigned dump = 0;
1357		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
1358		radeon_llvm_ctx.reserved_reg_count = ctx.file_offset[TGSI_FILE_INPUT];
1359		radeon_llvm_ctx.type = ctx.type;
1360		radeon_llvm_ctx.two_side = shader->two_side;
1361		radeon_llvm_ctx.face_input = ctx.face_gpr;
1362		radeon_llvm_ctx.r600_inputs = ctx.shader->input;
1363		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
1364		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
1365		if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
1366			dump = 1;
1367		}
1368		if (r600_llvm_compile(mod, &inst_bytes, &inst_byte_count,
1369							rscreen->family, dump)) {
1370			FREE(inst_bytes);
1371			radeon_llvm_dispose(&radeon_llvm_ctx);
1372			use_llvm = 0;
1373			fprintf(stderr, "R600 LLVM backend failed to compile "
1374				"shader.  Falling back to TGSI\n");
1375		} else {
1376			ctx.file_offset[TGSI_FILE_OUTPUT] =
1377					ctx.file_offset[TGSI_FILE_INPUT];
1378		}
1379		radeon_llvm_dispose(&radeon_llvm_ctx);
1380	}
1381#endif
1382/* End of LLVM backend setup */
1383
1384	if (shader->fs_write_all && rscreen->chip_class >= EVERGREEN)
1385		shader->nr_ps_max_color_exports = 8;
1386
1387	if (ctx.fragcoord_input >= 0 && !use_llvm) {
1388		if (ctx.bc->chip_class == CAYMAN) {
1389			for (j = 0 ; j < 4; j++) {
1390				struct r600_bytecode_alu alu;
1391				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1392				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1393				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1394				alu.src[0].chan = 3;
1395
1396				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1397				alu.dst.chan = j;
1398				alu.dst.write = (j == 3);
1399				alu.last = 1;
1400				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1401					return r;
1402			}
1403		} else {
1404			struct r600_bytecode_alu alu;
1405			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1406			alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1407			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1408			alu.src[0].chan = 3;
1409
1410			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1411			alu.dst.chan = 3;
1412			alu.dst.write = 1;
1413			alu.last = 1;
1414			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1415				return r;
1416		}
1417	}
1418
1419	if (shader->two_side && ctx.colors_used) {
1420		if ((r = process_twoside_color_inputs(&ctx)))
1421			return r;
1422	}
1423
1424	tgsi_parse_init(&ctx.parse, tokens);
1425	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1426		tgsi_parse_token(&ctx.parse);
1427		switch (ctx.parse.FullToken.Token.Type) {
1428		case TGSI_TOKEN_TYPE_INSTRUCTION:
1429			if (use_llvm) {
1430				continue;
1431			}
1432			r = tgsi_is_supported(&ctx);
1433			if (r)
1434				goto out_err;
1435			ctx.max_driver_temp_used = 0;
1436			/* reserve first tmp for everyone */
1437			r600_get_temp(&ctx);
1438
1439			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
1440			if ((r = tgsi_split_constant(&ctx)))
1441				goto out_err;
1442			if ((r = tgsi_split_literal_constant(&ctx)))
1443				goto out_err;
1444			if (ctx.bc->chip_class == CAYMAN)
1445				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
1446			else if (ctx.bc->chip_class >= EVERGREEN)
1447				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
1448			else
1449				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
1450			r = ctx.inst_info->process(&ctx);
1451			if (r)
1452				goto out_err;
1453			break;
1454		default:
1455			break;
1456		}
1457	}
1458
1459	/* Get instructions if we are using the LLVM backend. */
1460	if (use_llvm) {
1461		r600_bytecode_from_byte_stream(&ctx, inst_bytes, inst_byte_count);
1462		FREE(inst_bytes);
1463	}
1464
1465	noutput = shader->noutput;
1466
1467	if (ctx.clip_vertex_write) {
1468		/* need to convert a clipvertex write into clipdistance writes and not export
1469		   the clip vertex anymore */
1470
1471		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
1472		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1473		shader->output[noutput].gpr = ctx.temp_reg;
1474		noutput++;
1475		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1476		shader->output[noutput].gpr = ctx.temp_reg+1;
1477		noutput++;
1478
1479		/* reset spi_sid for clipvertex output to avoid confusing spi */
1480		shader->output[ctx.cv_output].spi_sid = 0;
1481
1482		shader->clip_dist_write = 0xFF;
1483
1484		for (i = 0; i < 8; i++) {
1485			int oreg = i >> 2;
1486			int ochan = i & 3;
1487
1488			for (j = 0; j < 4; j++) {
1489				struct r600_bytecode_alu alu;
1490				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1491				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4);
1492				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
1493				alu.src[0].chan = j;
1494
1495				alu.src[1].sel = 512 + i;
1496				alu.src[1].kc_bank = 1;
1497				alu.src[1].chan = j;
1498
1499				alu.dst.sel = ctx.temp_reg + oreg;
1500				alu.dst.chan = j;
1501				alu.dst.write = (j == ochan);
1502				if (j == 3)
1503					alu.last = 1;
1504				r = r600_bytecode_add_alu(ctx.bc, &alu);
1505				if (r)
1506					return r;
1507			}
1508		}
1509	}
1510
1511	/* Add stream outputs. */
1512	if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
1513		for (i = 0; i < so.num_outputs; i++) {
1514			struct r600_bytecode_output output;
1515
1516			if (so.output[i].output_buffer >= 4) {
1517				R600_ERR("exceeded the max number of stream output buffers, got: %d\n",
1518					 so.output[i].output_buffer);
1519				r = -EINVAL;
1520				goto out_err;
1521			}
1522			if (so.output[i].dst_offset < so.output[i].start_component) {
1523			   R600_ERR("stream_output - dst_offset cannot be less than start_component\n");
1524			   r = -EINVAL;
1525			   goto out_err;
1526			}
1527
1528			memset(&output, 0, sizeof(struct r600_bytecode_output));
1529			output.gpr = shader->output[so.output[i].register_index].gpr;
1530			output.elem_size = 0;
1531			output.array_base = so.output[i].dst_offset - so.output[i].start_component;
1532			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1533			output.burst_count = 1;
1534			output.barrier = 1;
1535			/* array_size is an upper limit for the burst_count
1536			 * with MEM_STREAM instructions */
1537			output.array_size = 0xFFF;
1538			output.comp_mask = ((1 << so.output[i].num_components) - 1) << so.output[i].start_component;
1539			if (ctx.bc->chip_class >= EVERGREEN) {
1540				switch (so.output[i].output_buffer) {
1541				case 0:
1542					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0;
1543					break;
1544				case 1:
1545					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1;
1546					break;
1547				case 2:
1548					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2;
1549					break;
1550				case 3:
1551					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3;
1552					break;
1553				}
1554			} else {
1555				switch (so.output[i].output_buffer) {
1556				case 0:
1557					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0;
1558					break;
1559				case 1:
1560					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1;
1561					break;
1562				case 2:
1563					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2;
1564					break;
1565				case 3:
1566					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3;
1567					break;
1568				}
1569			}
1570			r = r600_bytecode_add_output(ctx.bc, &output);
1571			if (r)
1572				goto out_err;
1573		}
1574	}
1575
1576	/* export output */
1577	for (i = 0, j = 0; i < noutput; i++, j++) {
1578		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1579		output[j].gpr = shader->output[i].gpr;
1580		output[j].elem_size = 3;
1581		output[j].swizzle_x = 0;
1582		output[j].swizzle_y = 1;
1583		output[j].swizzle_z = 2;
1584		output[j].swizzle_w = 3;
1585		output[j].burst_count = 1;
1586		output[j].barrier = 1;
1587		output[j].type = -1;
1588		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1589		switch (ctx.type) {
1590		case TGSI_PROCESSOR_VERTEX:
1591			switch (shader->output[i].name) {
1592			case TGSI_SEMANTIC_POSITION:
1593				output[j].array_base = next_pos_base++;
1594				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1595				break;
1596
1597			case TGSI_SEMANTIC_PSIZE:
1598				output[j].array_base = next_pos_base++;
1599				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1600				break;
1601			case TGSI_SEMANTIC_CLIPVERTEX:
1602				j--;
1603				break;
1604			case TGSI_SEMANTIC_CLIPDIST:
1605				output[j].array_base = next_pos_base++;
1606				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1607				/* spi_sid is 0 for clipdistance outputs that were generated
1608				 * for clipvertex - we don't need to pass them to PS */
1609				if (shader->output[i].spi_sid) {
1610					j++;
1611					/* duplicate it as PARAM to pass to the pixel shader */
1612					memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
1613					output[j].array_base = next_param_base++;
1614					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1615				}
1616				break;
1617			case TGSI_SEMANTIC_FOG:
1618				output[j].swizzle_y = 4; /* 0 */
1619				output[j].swizzle_z = 4; /* 0 */
1620				output[j].swizzle_w = 5; /* 1 */
1621				break;
1622			}
1623			break;
1624		case TGSI_PROCESSOR_FRAGMENT:
1625			if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
1626				/* never export more colors than the number of CBs */
1627				if (next_pixel_base && next_pixel_base >= key.nr_cbufs) {
1628					/* skip export */
1629					j--;
1630					continue;
1631				}
1632				output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
1633				output[j].array_base = next_pixel_base++;
1634				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1635				shader->nr_ps_color_exports++;
1636				if (shader->fs_write_all && (rscreen->chip_class >= EVERGREEN)) {
1637					for (k = 1; k < key.nr_cbufs; k++) {
1638						j++;
1639						memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1640						output[j].gpr = shader->output[i].gpr;
1641						output[j].elem_size = 3;
1642						output[j].swizzle_x = 0;
1643						output[j].swizzle_y = 1;
1644						output[j].swizzle_z = 2;
1645						output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
1646						output[j].burst_count = 1;
1647						output[j].barrier = 1;
1648						output[j].array_base = next_pixel_base++;
1649						output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1650						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1651						shader->nr_ps_color_exports++;
1652					}
1653				}
1654			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
1655				output[j].array_base = 61;
1656				output[j].swizzle_x = 2;
1657				output[j].swizzle_y = 7;
1658				output[j].swizzle_z = output[j].swizzle_w = 7;
1659				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1660			} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
1661				output[j].array_base = 61;
1662				output[j].swizzle_x = 7;
1663				output[j].swizzle_y = 1;
1664				output[j].swizzle_z = output[j].swizzle_w = 7;
1665				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1666			} else {
1667				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
1668				r = -EINVAL;
1669				goto out_err;
1670			}
1671			break;
1672		default:
1673			R600_ERR("unsupported processor type %d\n", ctx.type);
1674			r = -EINVAL;
1675			goto out_err;
1676		}
1677
1678		if (output[j].type==-1) {
1679			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1680			output[j].array_base = next_param_base++;
1681		}
1682	}
1683
1684	/* add fake param output for vertex shader if no param is exported */
1685	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
1686			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1687			output[j].gpr = 0;
1688			output[j].elem_size = 3;
1689			output[j].swizzle_x = 7;
1690			output[j].swizzle_y = 7;
1691			output[j].swizzle_z = 7;
1692			output[j].swizzle_w = 7;
1693			output[j].burst_count = 1;
1694			output[j].barrier = 1;
1695			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1696			output[j].array_base = 0;
1697			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1698			j++;
1699	}
1700
1701	/* add fake pixel export */
1702	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && next_pixel_base == 0) {
1703		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1704		output[j].gpr = 0;
1705		output[j].elem_size = 3;
1706		output[j].swizzle_x = 7;
1707		output[j].swizzle_y = 7;
1708		output[j].swizzle_z = 7;
1709		output[j].swizzle_w = 7;
1710		output[j].burst_count = 1;
1711		output[j].barrier = 1;
1712		output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1713		output[j].array_base = 0;
1714		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1715		j++;
1716	}
1717
1718	noutput = j;
1719
1720	/* set export done on last export of each type */
1721	for (i = noutput - 1, output_done = 0; i >= 0; i--) {
1722		if (ctx.bc->chip_class < CAYMAN) {
1723			if (i == (noutput - 1)) {
1724				output[i].end_of_program = 1;
1725			}
1726		}
1727		if (!(output_done & (1 << output[i].type))) {
1728			output_done |= (1 << output[i].type);
1729			output[i].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE);
1730		}
1731	}
1732	/* add output to bytecode */
1733	for (i = 0; i < noutput; i++) {
1734		r = r600_bytecode_add_output(ctx.bc, &output[i]);
1735		if (r)
1736			goto out_err;
1737	}
1738	/* add program end */
1739	if (ctx.bc->chip_class == CAYMAN)
1740		cm_bytecode_add_cf_end(ctx.bc);
1741
1742	/* check GPR limit - we have 124 = 128 - 4
1743	 * (4 are reserved as alu clause temporary registers) */
1744	if (ctx.bc->ngpr > 124) {
1745		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
1746		r = -ENOMEM;
1747		goto out_err;
1748	}
1749
1750	free(ctx.literals);
1751	tgsi_parse_free(&ctx.parse);
1752	return 0;
1753out_err:
1754	free(ctx.literals);
1755	tgsi_parse_free(&ctx.parse);
1756	return r;
1757}
1758
1759static int tgsi_unsupported(struct r600_shader_ctx *ctx)
1760{
1761	R600_ERR("%s tgsi opcode unsupported\n",
1762		 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
1763	return -EINVAL;
1764}
1765
/*
 * Handler that emits nothing: the context is not touched and the call
 * always succeeds (returns 0).
 */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	(void)ctx;

	return 0;
}
1770
1771static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
1772			const struct r600_shader_src *shader_src,
1773			unsigned chan)
1774{
1775	bc_src->sel = shader_src->sel;
1776	bc_src->chan = shader_src->swizzle[chan];
1777	bc_src->neg = shader_src->neg;
1778	bc_src->abs = shader_src->abs;
1779	bc_src->rel = shader_src->rel;
1780	bc_src->value = shader_src->value[bc_src->chan];
1781}
1782
1783static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
1784{
1785	bc_src->abs = 1;
1786	bc_src->neg = 0;
1787}
1788
1789static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
1790{
1791	bc_src->neg = !bc_src->neg;
1792}
1793
1794static void tgsi_dst(struct r600_shader_ctx *ctx,
1795		     const struct tgsi_full_dst_register *tgsi_dst,
1796		     unsigned swizzle,
1797		     struct r600_bytecode_alu_dst *r600_dst)
1798{
1799	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1800
1801	r600_dst->sel = tgsi_dst->Register.Index;
1802	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
1803	r600_dst->chan = swizzle;
1804	r600_dst->write = 1;
1805	if (tgsi_dst->Register.Indirect)
1806		r600_dst->rel = V_SQ_REL_RELATIVE;
1807	if (inst->Instruction.Saturate) {
1808		r600_dst->clamp = 1;
1809	}
1810}
1811
/*
 * Return the index (0..3) of the highest channel enabled in the low four
 * bits of writemask.  Bits above the fourth are ignored; 0 is returned
 * when only channel 0 is enabled and also when no channel is enabled.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* walk from W down to Y; channel 0 is the fallback either way */
	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1 << chan))
			return chan;
	}
	return 0;
}
1823
1824static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
1825{
1826	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1827	struct r600_bytecode_alu alu;
1828	int i, j, r;
1829	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1830
1831	for (i = 0; i < lasti + 1; i++) {
1832		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1833			continue;
1834
1835		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1836		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1837
1838		alu.inst = ctx->inst_info->r600_opcode;
1839		if (!swap) {
1840			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1841				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
1842			}
1843		} else {
1844			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
1845			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1846		}
1847		/* handle some special cases */
1848		switch (ctx->inst_info->tgsi_opcode) {
1849		case TGSI_OPCODE_SUB:
1850			r600_bytecode_src_toggle_neg(&alu.src[1]);
1851			break;
1852		case TGSI_OPCODE_ABS:
1853			r600_bytecode_src_set_abs(&alu.src[0]);
1854			break;
1855		default:
1856			break;
1857		}
1858		if (i == lasti || trans_only) {
1859			alu.last = 1;
1860		}
1861		r = r600_bytecode_add_alu(ctx->bc, &alu);
1862		if (r)
1863			return r;
1864	}
1865	return 0;
1866}
1867
/* Per-channel ALU emit: sources in TGSI order, only the final op ends the group. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1872
/* Per-channel ALU emit with the first two sources exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	const int swap = 1;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1877
/* Per-channel ALU emit where every op is flagged as the last of its group. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 1;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1882
1883static int tgsi_ineg(struct r600_shader_ctx *ctx)
1884{
1885	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1886	struct r600_bytecode_alu alu;
1887	int i, r;
1888	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1889
1890	for (i = 0; i < lasti + 1; i++) {
1891
1892		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1893			continue;
1894		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1895		alu.inst = ctx->inst_info->r600_opcode;
1896
1897		alu.src[0].sel = V_SQ_ALU_SRC_0;
1898
1899		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1900
1901		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1902
1903		if (i == lasti) {
1904			alu.last = 1;
1905		}
1906		r = r600_bytecode_add_alu(ctx->bc, &alu);
1907		if (r)
1908			return r;
1909	}
1910	return 0;
1911
1912}
1913
1914static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
1915{
1916	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1917	int i, j, r;
1918	struct r600_bytecode_alu alu;
1919	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1920
1921	for (i = 0 ; i < last_slot; i++) {
1922		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1923		alu.inst = ctx->inst_info->r600_opcode;
1924		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1925			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
1926
1927			/* RSQ should take the absolute value of src */
1928			if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_RSQ) {
1929				r600_bytecode_src_set_abs(&alu.src[j]);
1930			}
1931		}
1932		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1933		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
1934
1935		if (i == last_slot - 1)
1936			alu.last = 1;
1937		r = r600_bytecode_add_alu(ctx->bc, &alu);
1938		if (r)
1939			return r;
1940	}
1941	return 0;
1942}
1943
1944static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
1945{
1946	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1947	int i, j, k, r;
1948	struct r600_bytecode_alu alu;
1949	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1950	for (k = 0; k < last_slot; k++) {
1951		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
1952			continue;
1953
1954		for (i = 0 ; i < 4; i++) {
1955			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1956			alu.inst = ctx->inst_info->r600_opcode;
1957			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1958				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
1959			}
1960			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1961			alu.dst.write = (i == k);
1962			if (i == 3)
1963				alu.last = 1;
1964			r = r600_bytecode_add_alu(ctx->bc, &alu);
1965			if (r)
1966				return r;
1967		}
1968	}
1969	return 0;
1970}
1971
1972/*
1973 * r600 - trunc to -PI..PI range
1974 * r700 - normalize by dividing by 2PI
1975 * see fdo bug 27901
1976 */
1977static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
1978{
1979	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
1980	static float double_pi = 3.1415926535 * 2;
1981	static float neg_pi = -3.1415926535;
1982
1983	int r;
1984	struct r600_bytecode_alu alu;
1985
1986	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1987	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
1988	alu.is_op3 = 1;
1989
1990	alu.dst.chan = 0;
1991	alu.dst.sel = ctx->temp_reg;
1992	alu.dst.write = 1;
1993
1994	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
1995
1996	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1997	alu.src[1].chan = 0;
1998	alu.src[1].value = *(uint32_t *)&half_inv_pi;
1999	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2000	alu.src[2].chan = 0;
2001	alu.last = 1;
2002	r = r600_bytecode_add_alu(ctx->bc, &alu);
2003	if (r)
2004		return r;
2005
2006	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2007	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
2008
2009	alu.dst.chan = 0;
2010	alu.dst.sel = ctx->temp_reg;
2011	alu.dst.write = 1;
2012
2013	alu.src[0].sel = ctx->temp_reg;
2014	alu.src[0].chan = 0;
2015	alu.last = 1;
2016	r = r600_bytecode_add_alu(ctx->bc, &alu);
2017	if (r)
2018		return r;
2019
2020	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2021	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
2022	alu.is_op3 = 1;
2023
2024	alu.dst.chan = 0;
2025	alu.dst.sel = ctx->temp_reg;
2026	alu.dst.write = 1;
2027
2028	alu.src[0].sel = ctx->temp_reg;
2029	alu.src[0].chan = 0;
2030
2031	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2032	alu.src[1].chan = 0;
2033	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
2034	alu.src[2].chan = 0;
2035
2036	if (ctx->bc->chip_class == R600) {
2037		alu.src[1].value = *(uint32_t *)&double_pi;
2038		alu.src[2].value = *(uint32_t *)&neg_pi;
2039	} else {
2040		alu.src[1].sel = V_SQ_ALU_SRC_1;
2041		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2042		alu.src[2].neg = 1;
2043	}
2044
2045	alu.last = 1;
2046	r = r600_bytecode_add_alu(ctx->bc, &alu);
2047	if (r)
2048		return r;
2049	return 0;
2050}
2051
2052static int cayman_trig(struct r600_shader_ctx *ctx)
2053{
2054	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2055	struct r600_bytecode_alu alu;
2056	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2057	int i, r;
2058
2059	r = tgsi_setup_trig(ctx);
2060	if (r)
2061		return r;
2062
2063
2064	for (i = 0; i < last_slot; i++) {
2065		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2066		alu.inst = ctx->inst_info->r600_opcode;
2067		alu.dst.chan = i;
2068
2069		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2070		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2071
2072		alu.src[0].sel = ctx->temp_reg;
2073		alu.src[0].chan = 0;
2074		if (i == last_slot - 1)
2075			alu.last = 1;
2076		r = r600_bytecode_add_alu(ctx->bc, &alu);
2077		if (r)
2078			return r;
2079	}
2080	return 0;
2081}
2082
2083static int tgsi_trig(struct r600_shader_ctx *ctx)
2084{
2085	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2086	struct r600_bytecode_alu alu;
2087	int i, r;
2088	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2089
2090	r = tgsi_setup_trig(ctx);
2091	if (r)
2092		return r;
2093
2094	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2095	alu.inst = ctx->inst_info->r600_opcode;
2096	alu.dst.chan = 0;
2097	alu.dst.sel = ctx->temp_reg;
2098	alu.dst.write = 1;
2099
2100	alu.src[0].sel = ctx->temp_reg;
2101	alu.src[0].chan = 0;
2102	alu.last = 1;
2103	r = r600_bytecode_add_alu(ctx->bc, &alu);
2104	if (r)
2105		return r;
2106
2107	/* replicate result */
2108	for (i = 0; i < lasti + 1; i++) {
2109		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2110			continue;
2111
2112		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2113		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2114
2115		alu.src[0].sel = ctx->temp_reg;
2116		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2117		if (i == lasti)
2118			alu.last = 1;
2119		r = r600_bytecode_add_alu(ctx->bc, &alu);
2120		if (r)
2121			return r;
2122	}
2123	return 0;
2124}
2125
2126static int tgsi_scs(struct r600_shader_ctx *ctx)
2127{
2128	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2129	struct r600_bytecode_alu alu;
2130	int i, r;
2131
2132	/* We'll only need the trig stuff if we are going to write to the
2133	 * X or Y components of the destination vector.
2134	 */
2135	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
2136		r = tgsi_setup_trig(ctx);
2137		if (r)
2138			return r;
2139	}
2140
2141	/* dst.x = COS */
2142	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2143		if (ctx->bc->chip_class == CAYMAN) {
2144			for (i = 0 ; i < 3; i++) {
2145				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2146				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2147				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2148
2149				if (i == 0)
2150					alu.dst.write = 1;
2151				else
2152					alu.dst.write = 0;
2153				alu.src[0].sel = ctx->temp_reg;
2154				alu.src[0].chan = 0;
2155				if (i == 2)
2156					alu.last = 1;
2157				r = r600_bytecode_add_alu(ctx->bc, &alu);
2158				if (r)
2159					return r;
2160			}
2161		} else {
2162			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2163			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2164			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2165
2166			alu.src[0].sel = ctx->temp_reg;
2167			alu.src[0].chan = 0;
2168			alu.last = 1;
2169			r = r600_bytecode_add_alu(ctx->bc, &alu);
2170			if (r)
2171				return r;
2172		}
2173	}
2174
2175	/* dst.y = SIN */
2176	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2177		if (ctx->bc->chip_class == CAYMAN) {
2178			for (i = 0 ; i < 3; i++) {
2179				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2180				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2181				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2182				if (i == 1)
2183					alu.dst.write = 1;
2184				else
2185					alu.dst.write = 0;
2186				alu.src[0].sel = ctx->temp_reg;
2187				alu.src[0].chan = 0;
2188				if (i == 2)
2189					alu.last = 1;
2190				r = r600_bytecode_add_alu(ctx->bc, &alu);
2191				if (r)
2192					return r;
2193			}
2194		} else {
2195			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2196			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2197			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2198
2199			alu.src[0].sel = ctx->temp_reg;
2200			alu.src[0].chan = 0;
2201			alu.last = 1;
2202			r = r600_bytecode_add_alu(ctx->bc, &alu);
2203			if (r)
2204				return r;
2205		}
2206	}
2207
2208	/* dst.z = 0.0; */
2209	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2210		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2211
2212		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2213
2214		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2215
2216		alu.src[0].sel = V_SQ_ALU_SRC_0;
2217		alu.src[0].chan = 0;
2218
2219		alu.last = 1;
2220
2221		r = r600_bytecode_add_alu(ctx->bc, &alu);
2222		if (r)
2223			return r;
2224	}
2225
2226	/* dst.w = 1.0; */
2227	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2228		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2229
2230		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2231
2232		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2233
2234		alu.src[0].sel = V_SQ_ALU_SRC_1;
2235		alu.src[0].chan = 0;
2236
2237		alu.last = 1;
2238
2239		r = r600_bytecode_add_alu(ctx->bc, &alu);
2240		if (r)
2241			return r;
2242	}
2243
2244	return 0;
2245}
2246
2247static int tgsi_kill(struct r600_shader_ctx *ctx)
2248{
2249	struct r600_bytecode_alu alu;
2250	int i, r;
2251
2252	for (i = 0; i < 4; i++) {
2253		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2254		alu.inst = ctx->inst_info->r600_opcode;
2255
2256		alu.dst.chan = i;
2257
2258		alu.src[0].sel = V_SQ_ALU_SRC_0;
2259
2260		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) {
2261			alu.src[1].sel = V_SQ_ALU_SRC_1;
2262			alu.src[1].neg = 1;
2263		} else {
2264			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2265		}
2266		if (i == 3) {
2267			alu.last = 1;
2268		}
2269		r = r600_bytecode_add_alu(ctx->bc, &alu);
2270		if (r)
2271			return r;
2272	}
2273
2274	/* kill must be last in ALU */
2275	ctx->bc->force_add_cf = 1;
2276	ctx->shader->uses_kill = TRUE;
2277	return 0;
2278}
2279
2280static int tgsi_lit(struct r600_shader_ctx *ctx)
2281{
2282	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2283	struct r600_bytecode_alu alu;
2284	int r;
2285
2286	/* tmp.x = max(src.y, 0.0) */
2287	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2288	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2289	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
2290	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2291	alu.src[1].chan = 1;
2292
2293	alu.dst.sel = ctx->temp_reg;
2294	alu.dst.chan = 0;
2295	alu.dst.write = 1;
2296
2297	alu.last = 1;
2298	r = r600_bytecode_add_alu(ctx->bc, &alu);
2299	if (r)
2300		return r;
2301
2302	if (inst->Dst[0].Register.WriteMask & (1 << 2))
2303	{
2304		int chan;
2305		int sel;
2306		int i;
2307
2308		if (ctx->bc->chip_class == CAYMAN) {
2309			for (i = 0; i < 3; i++) {
2310				/* tmp.z = log(tmp.x) */
2311				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2312				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2313				alu.src[0].sel = ctx->temp_reg;
2314				alu.src[0].chan = 0;
2315				alu.dst.sel = ctx->temp_reg;
2316				alu.dst.chan = i;
2317				if (i == 2) {
2318					alu.dst.write = 1;
2319					alu.last = 1;
2320				} else
2321					alu.dst.write = 0;
2322
2323				r = r600_bytecode_add_alu(ctx->bc, &alu);
2324				if (r)
2325					return r;
2326			}
2327		} else {
2328			/* tmp.z = log(tmp.x) */
2329			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2330			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2331			alu.src[0].sel = ctx->temp_reg;
2332			alu.src[0].chan = 0;
2333			alu.dst.sel = ctx->temp_reg;
2334			alu.dst.chan = 2;
2335			alu.dst.write = 1;
2336			alu.last = 1;
2337			r = r600_bytecode_add_alu(ctx->bc, &alu);
2338			if (r)
2339				return r;
2340		}
2341
2342		chan = alu.dst.chan;
2343		sel = alu.dst.sel;
2344
2345		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
2346		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2347		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT);
2348		alu.src[0].sel  = sel;
2349		alu.src[0].chan = chan;
2350		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
2351		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
2352		alu.dst.sel = ctx->temp_reg;
2353		alu.dst.chan = 0;
2354		alu.dst.write = 1;
2355		alu.is_op3 = 1;
2356		alu.last = 1;
2357		r = r600_bytecode_add_alu(ctx->bc, &alu);
2358		if (r)
2359			return r;
2360
2361		if (ctx->bc->chip_class == CAYMAN) {
2362			for (i = 0; i < 3; i++) {
2363				/* dst.z = exp(tmp.x) */
2364				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2365				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2366				alu.src[0].sel = ctx->temp_reg;
2367				alu.src[0].chan = 0;
2368				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2369				if (i == 2) {
2370					alu.dst.write = 1;
2371					alu.last = 1;
2372				} else
2373					alu.dst.write = 0;
2374				r = r600_bytecode_add_alu(ctx->bc, &alu);
2375				if (r)
2376					return r;
2377			}
2378		} else {
2379			/* dst.z = exp(tmp.x) */
2380			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2381			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2382			alu.src[0].sel = ctx->temp_reg;
2383			alu.src[0].chan = 0;
2384			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2385			alu.last = 1;
2386			r = r600_bytecode_add_alu(ctx->bc, &alu);
2387			if (r)
2388				return r;
2389		}
2390	}
2391
2392	/* dst.x, <- 1.0  */
2393	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2394	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2395	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
2396	alu.src[0].chan = 0;
2397	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2398	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
2399	r = r600_bytecode_add_alu(ctx->bc, &alu);
2400	if (r)
2401		return r;
2402
2403	/* dst.y = max(src.x, 0.0) */
2404	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2405	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2406	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2407	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2408	alu.src[1].chan = 0;
2409	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2410	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
2411	r = r600_bytecode_add_alu(ctx->bc, &alu);
2412	if (r)
2413		return r;
2414
2415	/* dst.w, <- 1.0  */
2416	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2417	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2418	alu.src[0].sel  = V_SQ_ALU_SRC_1;
2419	alu.src[0].chan = 0;
2420	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2421	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
2422	alu.last = 1;
2423	r = r600_bytecode_add_alu(ctx->bc, &alu);
2424	if (r)
2425		return r;
2426
2427	return 0;
2428}
2429
2430static int tgsi_rsq(struct r600_shader_ctx *ctx)
2431{
2432	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2433	struct r600_bytecode_alu alu;
2434	int i, r;
2435
2436	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2437
2438	/* XXX:
2439	 * For state trackers other than OpenGL, we'll want to use
2440	 * _RECIPSQRT_IEEE instead.
2441	 */
2442	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED);
2443
2444	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2445		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2446		r600_bytecode_src_set_abs(&alu.src[i]);
2447	}
2448	alu.dst.sel = ctx->temp_reg;
2449	alu.dst.write = 1;
2450	alu.last = 1;
2451	r = r600_bytecode_add_alu(ctx->bc, &alu);
2452	if (r)
2453		return r;
2454	/* replicate result */
2455	return tgsi_helper_tempx_replicate(ctx);
2456}
2457
2458static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
2459{
2460	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2461	struct r600_bytecode_alu alu;
2462	int i, r;
2463
2464	for (i = 0; i < 4; i++) {
2465		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2466		alu.src[0].sel = ctx->temp_reg;
2467		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2468		alu.dst.chan = i;
2469		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2470		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2471		if (i == 3)
2472			alu.last = 1;
2473		r = r600_bytecode_add_alu(ctx->bc, &alu);
2474		if (r)
2475			return r;
2476	}
2477	return 0;
2478}
2479
2480static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
2481{
2482	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2483	struct r600_bytecode_alu alu;
2484	int i, r;
2485
2486	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2487	alu.inst = ctx->inst_info->r600_opcode;
2488	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2489		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2490	}
2491	alu.dst.sel = ctx->temp_reg;
2492	alu.dst.write = 1;
2493	alu.last = 1;
2494	r = r600_bytecode_add_alu(ctx->bc, &alu);
2495	if (r)
2496		return r;
2497	/* replicate result */
2498	return tgsi_helper_tempx_replicate(ctx);
2499}
2500
2501static int cayman_pow(struct r600_shader_ctx *ctx)
2502{
2503	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2504	int i, r;
2505	struct r600_bytecode_alu alu;
2506	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2507
2508	for (i = 0; i < 3; i++) {
2509		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2510		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2511		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2512		alu.dst.sel = ctx->temp_reg;
2513		alu.dst.chan = i;
2514		alu.dst.write = 1;
2515		if (i == 2)
2516			alu.last = 1;
2517		r = r600_bytecode_add_alu(ctx->bc, &alu);
2518		if (r)
2519			return r;
2520	}
2521
2522	/* b * LOG2(a) */
2523	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2524	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2525	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2526	alu.src[1].sel = ctx->temp_reg;
2527	alu.dst.sel = ctx->temp_reg;
2528	alu.dst.write = 1;
2529	alu.last = 1;
2530	r = r600_bytecode_add_alu(ctx->bc, &alu);
2531	if (r)
2532		return r;
2533
2534	for (i = 0; i < last_slot; i++) {
2535		/* POW(a,b) = EXP2(b * LOG2(a))*/
2536		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2537		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2538		alu.src[0].sel = ctx->temp_reg;
2539
2540		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2541		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2542		if (i == last_slot - 1)
2543			alu.last = 1;
2544		r = r600_bytecode_add_alu(ctx->bc, &alu);
2545		if (r)
2546			return r;
2547	}
2548	return 0;
2549}
2550
2551static int tgsi_pow(struct r600_shader_ctx *ctx)
2552{
2553	struct r600_bytecode_alu alu;
2554	int r;
2555
2556	/* LOG2(a) */
2557	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2558	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2559	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2560	alu.dst.sel = ctx->temp_reg;
2561	alu.dst.write = 1;
2562	alu.last = 1;
2563	r = r600_bytecode_add_alu(ctx->bc, &alu);
2564	if (r)
2565		return r;
2566	/* b * LOG2(a) */
2567	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2568	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2569	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2570	alu.src[1].sel = ctx->temp_reg;
2571	alu.dst.sel = ctx->temp_reg;
2572	alu.dst.write = 1;
2573	alu.last = 1;
2574	r = r600_bytecode_add_alu(ctx->bc, &alu);
2575	if (r)
2576		return r;
2577	/* POW(a,b) = EXP2(b * LOG2(a))*/
2578	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2579	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2580	alu.src[0].sel = ctx->temp_reg;
2581	alu.dst.sel = ctx->temp_reg;
2582	alu.dst.write = 1;
2583	alu.last = 1;
2584	r = r600_bytecode_add_alu(ctx->bc, &alu);
2585	if (r)
2586		return r;
2587	return tgsi_helper_tempx_replicate(ctx);
2588}
2589
2590static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
2591{
2592	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2593	struct r600_bytecode_alu alu;
2594	int i, r, j;
2595	unsigned write_mask = inst->Dst[0].Register.WriteMask;
2596	int tmp0 = ctx->temp_reg;
2597	int tmp1 = r600_get_temp(ctx);
2598	int tmp2 = r600_get_temp(ctx);
2599	int tmp3 = r600_get_temp(ctx);
2600	/* Unsigned path:
2601	 *
2602	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
2603	 *
2604	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
2605	 * 2. tmp0.z = lo (tmp0.x * src2)
2606	 * 3. tmp0.w = -tmp0.z
2607	 * 4. tmp0.y = hi (tmp0.x * src2)
2608	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
2609	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
2610	 * 7. tmp1.x = tmp0.x - tmp0.w
2611	 * 8. tmp1.y = tmp0.x + tmp0.w
2612	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
2613	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
2614	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
2615	 *
2616	 * 12. tmp0.w = src1 - tmp0.y       = r
2617	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
2618	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
2619	 *
2620	 * if DIV
2621	 *
2622	 *   15. tmp1.z = tmp0.z + 1			= q + 1
2623	 *   16. tmp1.w = tmp0.z - 1			= q - 1
2624	 *
2625	 * else MOD
2626	 *
2627	 *   15. tmp1.z = tmp0.w - src2			= r - src2
2628	 *   16. tmp1.w = tmp0.w + src2			= r + src2
2629	 *
2630	 * endif
2631	 *
2632	 * 17. tmp1.x = tmp1.x & tmp1.y
2633	 *
2634	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
2635	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
2636	 *
2637	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
2638	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
2639	 *
2640	 * Signed path:
2641	 *
2642	 * Same as unsigned, using abs values of the operands,
2643	 * and fixing the sign of the result in the end.
2644	 */
2645
2646	for (i = 0; i < 4; i++) {
2647		if (!(write_mask & (1<<i)))
2648			continue;
2649
2650		if (signed_op) {
2651
2652			/* tmp2.x = -src0 */
2653			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2654			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2655
2656			alu.dst.sel = tmp2;
2657			alu.dst.chan = 0;
2658			alu.dst.write = 1;
2659
2660			alu.src[0].sel = V_SQ_ALU_SRC_0;
2661
2662			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2663
2664			alu.last = 1;
2665			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2666				return r;
2667
2668			/* tmp2.y = -src1 */
2669			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2670			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2671
2672			alu.dst.sel = tmp2;
2673			alu.dst.chan = 1;
2674			alu.dst.write = 1;
2675
2676			alu.src[0].sel = V_SQ_ALU_SRC_0;
2677
2678			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2679
2680			alu.last = 1;
2681			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2682				return r;
2683
2684			/* tmp2.z sign bit is set if src0 and src2 signs are different */
2685			/* it will be a sign of the quotient */
2686			if (!mod) {
2687
2688				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2689				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT);
2690
2691				alu.dst.sel = tmp2;
2692				alu.dst.chan = 2;
2693				alu.dst.write = 1;
2694
2695				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2696				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2697
2698				alu.last = 1;
2699				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2700					return r;
2701			}
2702
2703			/* tmp2.x = |src0| */
2704			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2705			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2706			alu.is_op3 = 1;
2707
2708			alu.dst.sel = tmp2;
2709			alu.dst.chan = 0;
2710			alu.dst.write = 1;
2711
2712			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2713			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2714			alu.src[2].sel = tmp2;
2715			alu.src[2].chan = 0;
2716
2717			alu.last = 1;
2718			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2719				return r;
2720
2721			/* tmp2.y = |src1| */
2722			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2723			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2724			alu.is_op3 = 1;
2725
2726			alu.dst.sel = tmp2;
2727			alu.dst.chan = 1;
2728			alu.dst.write = 1;
2729
2730			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2731			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2732			alu.src[2].sel = tmp2;
2733			alu.src[2].chan = 1;
2734
2735			alu.last = 1;
2736			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2737				return r;
2738
2739		}
2740
2741		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
2742		if (ctx->bc->chip_class == CAYMAN) {
2743			/* tmp3.x = u2f(src2) */
2744			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2745			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
2746
2747			alu.dst.sel = tmp3;
2748			alu.dst.chan = 0;
2749			alu.dst.write = 1;
2750
2751			if (signed_op) {
2752				alu.src[0].sel = tmp2;
2753				alu.src[0].chan = 1;
2754			} else {
2755				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2756			}
2757
2758			alu.last = 1;
2759			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2760				return r;
2761
2762			/* tmp0.x = recip(tmp3.x) */
2763			for (j = 0 ; j < 3; j++) {
2764				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2765				alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE;
2766
2767				alu.dst.sel = tmp0;
2768				alu.dst.chan = j;
2769				alu.dst.write = (j == 0);
2770
2771				alu.src[0].sel = tmp3;
2772				alu.src[0].chan = 0;
2773
2774				if (j == 2)
2775					alu.last = 1;
2776				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2777					return r;
2778			}
2779
2780			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2781			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2782
2783			alu.src[0].sel = tmp0;
2784			alu.src[0].chan = 0;
2785
2786			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2787			alu.src[1].value = 0x4f800000;
2788
2789			alu.dst.sel = tmp3;
2790			alu.dst.write = 1;
2791			alu.last = 1;
2792			r = r600_bytecode_add_alu(ctx->bc, &alu);
2793			if (r)
2794				return r;
2795
2796			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2797			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
2798
2799			alu.dst.sel = tmp0;
2800			alu.dst.chan = 0;
2801			alu.dst.write = 1;
2802
2803			alu.src[0].sel = tmp3;
2804			alu.src[0].chan = 0;
2805
2806			alu.last = 1;
2807			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2808				return r;
2809
2810		} else {
2811			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2812			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT);
2813
2814			alu.dst.sel = tmp0;
2815			alu.dst.chan = 0;
2816			alu.dst.write = 1;
2817
2818			if (signed_op) {
2819				alu.src[0].sel = tmp2;
2820				alu.src[0].chan = 1;
2821			} else {
2822				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2823			}
2824
2825			alu.last = 1;
2826			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2827				return r;
2828		}
2829
2830		/* 2. tmp0.z = lo (tmp0.x * src2) */
2831		if (ctx->bc->chip_class == CAYMAN) {
2832			for (j = 0 ; j < 4; j++) {
2833				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2834				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2835
2836				alu.dst.sel = tmp0;
2837				alu.dst.chan = j;
2838				alu.dst.write = (j == 2);
2839
2840				alu.src[0].sel = tmp0;
2841				alu.src[0].chan = 0;
2842				if (signed_op) {
2843					alu.src[1].sel = tmp2;
2844					alu.src[1].chan = 1;
2845				} else {
2846					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2847				}
2848
2849				alu.last = (j == 3);
2850				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2851					return r;
2852			}
2853		} else {
2854			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2855			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2856
2857			alu.dst.sel = tmp0;
2858			alu.dst.chan = 2;
2859			alu.dst.write = 1;
2860
2861			alu.src[0].sel = tmp0;
2862			alu.src[0].chan = 0;
2863			if (signed_op) {
2864				alu.src[1].sel = tmp2;
2865				alu.src[1].chan = 1;
2866			} else {
2867				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2868			}
2869
2870			alu.last = 1;
2871			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2872				return r;
2873		}
2874
2875		/* 3. tmp0.w = -tmp0.z */
2876		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2877		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2878
2879		alu.dst.sel = tmp0;
2880		alu.dst.chan = 3;
2881		alu.dst.write = 1;
2882
2883		alu.src[0].sel = V_SQ_ALU_SRC_0;
2884		alu.src[1].sel = tmp0;
2885		alu.src[1].chan = 2;
2886
2887		alu.last = 1;
2888		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2889			return r;
2890
2891		/* 4. tmp0.y = hi (tmp0.x * src2) */
2892		if (ctx->bc->chip_class == CAYMAN) {
2893			for (j = 0 ; j < 4; j++) {
2894				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2895				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2896
2897				alu.dst.sel = tmp0;
2898				alu.dst.chan = j;
2899				alu.dst.write = (j == 1);
2900
2901				alu.src[0].sel = tmp0;
2902				alu.src[0].chan = 0;
2903
2904				if (signed_op) {
2905					alu.src[1].sel = tmp2;
2906					alu.src[1].chan = 1;
2907				} else {
2908					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2909				}
2910				alu.last = (j == 3);
2911				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2912					return r;
2913			}
2914		} else {
2915			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2916			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2917
2918			alu.dst.sel = tmp0;
2919			alu.dst.chan = 1;
2920			alu.dst.write = 1;
2921
2922			alu.src[0].sel = tmp0;
2923			alu.src[0].chan = 0;
2924
2925			if (signed_op) {
2926				alu.src[1].sel = tmp2;
2927				alu.src[1].chan = 1;
2928			} else {
2929				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2930			}
2931
2932			alu.last = 1;
2933			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2934				return r;
2935		}
2936
2937		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
2938		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2939		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
2940		alu.is_op3 = 1;
2941
2942		alu.dst.sel = tmp0;
2943		alu.dst.chan = 2;
2944		alu.dst.write = 1;
2945
2946		alu.src[0].sel = tmp0;
2947		alu.src[0].chan = 1;
2948		alu.src[1].sel = tmp0;
2949		alu.src[1].chan = 3;
2950		alu.src[2].sel = tmp0;
2951		alu.src[2].chan = 2;
2952
2953		alu.last = 1;
2954		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2955			return r;
2956
2957		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
2958		if (ctx->bc->chip_class == CAYMAN) {
2959			for (j = 0 ; j < 4; j++) {
2960				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2961				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2962
2963				alu.dst.sel = tmp0;
2964				alu.dst.chan = j;
2965				alu.dst.write = (j == 3);
2966
2967				alu.src[0].sel = tmp0;
2968				alu.src[0].chan = 2;
2969
2970				alu.src[1].sel = tmp0;
2971				alu.src[1].chan = 0;
2972
2973				alu.last = (j == 3);
2974				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2975					return r;
2976			}
2977		} else {
2978			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2979			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2980
2981			alu.dst.sel = tmp0;
2982			alu.dst.chan = 3;
2983			alu.dst.write = 1;
2984
2985			alu.src[0].sel = tmp0;
2986			alu.src[0].chan = 2;
2987
2988			alu.src[1].sel = tmp0;
2989			alu.src[1].chan = 0;
2990
2991			alu.last = 1;
2992			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2993				return r;
2994		}
2995
2996		/* 7. tmp1.x = tmp0.x - tmp0.w */
2997		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2998		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2999
3000		alu.dst.sel = tmp1;
3001		alu.dst.chan = 0;
3002		alu.dst.write = 1;
3003
3004		alu.src[0].sel = tmp0;
3005		alu.src[0].chan = 0;
3006		alu.src[1].sel = tmp0;
3007		alu.src[1].chan = 3;
3008
3009		alu.last = 1;
3010		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3011			return r;
3012
3013		/* 8. tmp1.y = tmp0.x + tmp0.w */
3014		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3015		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3016
3017		alu.dst.sel = tmp1;
3018		alu.dst.chan = 1;
3019		alu.dst.write = 1;
3020
3021		alu.src[0].sel = tmp0;
3022		alu.src[0].chan = 0;
3023		alu.src[1].sel = tmp0;
3024		alu.src[1].chan = 3;
3025
3026		alu.last = 1;
3027		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3028			return r;
3029
3030		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
3031		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3032		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3033		alu.is_op3 = 1;
3034
3035		alu.dst.sel = tmp0;
3036		alu.dst.chan = 0;
3037		alu.dst.write = 1;
3038
3039		alu.src[0].sel = tmp0;
3040		alu.src[0].chan = 1;
3041		alu.src[1].sel = tmp1;
3042		alu.src[1].chan = 1;
3043		alu.src[2].sel = tmp1;
3044		alu.src[2].chan = 0;
3045
3046		alu.last = 1;
3047		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3048			return r;
3049
3050		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
3051		if (ctx->bc->chip_class == CAYMAN) {
3052			for (j = 0 ; j < 4; j++) {
3053				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3054				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3055
3056				alu.dst.sel = tmp0;
3057				alu.dst.chan = j;
3058				alu.dst.write = (j == 2);
3059
3060				alu.src[0].sel = tmp0;
3061				alu.src[0].chan = 0;
3062
3063				if (signed_op) {
3064					alu.src[1].sel = tmp2;
3065					alu.src[1].chan = 0;
3066				} else {
3067					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3068				}
3069
3070				alu.last = (j == 3);
3071				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3072					return r;
3073			}
3074		} else {
3075			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3076			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3077
3078			alu.dst.sel = tmp0;
3079			alu.dst.chan = 2;
3080			alu.dst.write = 1;
3081
3082			alu.src[0].sel = tmp0;
3083			alu.src[0].chan = 0;
3084
3085			if (signed_op) {
3086				alu.src[1].sel = tmp2;
3087				alu.src[1].chan = 0;
3088			} else {
3089				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3090			}
3091
3092			alu.last = 1;
3093			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3094				return r;
3095		}
3096
3097		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
3098		if (ctx->bc->chip_class == CAYMAN) {
3099			for (j = 0 ; j < 4; j++) {
3100				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3101				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3102
3103				alu.dst.sel = tmp0;
3104				alu.dst.chan = j;
3105				alu.dst.write = (j == 1);
3106
3107				if (signed_op) {
3108					alu.src[0].sel = tmp2;
3109					alu.src[0].chan = 1;
3110				} else {
3111					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3112				}
3113
3114				alu.src[1].sel = tmp0;
3115				alu.src[1].chan = 2;
3116
3117				alu.last = (j == 3);
3118				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3119					return r;
3120			}
3121		} else {
3122			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3123			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3124
3125			alu.dst.sel = tmp0;
3126			alu.dst.chan = 1;
3127			alu.dst.write = 1;
3128
3129			if (signed_op) {
3130				alu.src[0].sel = tmp2;
3131				alu.src[0].chan = 1;
3132			} else {
3133				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3134			}
3135
3136			alu.src[1].sel = tmp0;
3137			alu.src[1].chan = 2;
3138
3139			alu.last = 1;
3140			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3141				return r;
3142		}
3143
3144		/* 12. tmp0.w = src1 - tmp0.y       = r */
3145		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3146		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3147
3148		alu.dst.sel = tmp0;
3149		alu.dst.chan = 3;
3150		alu.dst.write = 1;
3151
3152		if (signed_op) {
3153			alu.src[0].sel = tmp2;
3154			alu.src[0].chan = 0;
3155		} else {
3156			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3157		}
3158
3159		alu.src[1].sel = tmp0;
3160		alu.src[1].chan = 1;
3161
3162		alu.last = 1;
3163		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3164			return r;
3165
3166		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
3167		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3168		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3169
3170		alu.dst.sel = tmp1;
3171		alu.dst.chan = 0;
3172		alu.dst.write = 1;
3173
3174		alu.src[0].sel = tmp0;
3175		alu.src[0].chan = 3;
3176		if (signed_op) {
3177			alu.src[1].sel = tmp2;
3178			alu.src[1].chan = 1;
3179		} else {
3180			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3181		}
3182
3183		alu.last = 1;
3184		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3185			return r;
3186
3187		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
3188		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3189		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3190
3191		alu.dst.sel = tmp1;
3192		alu.dst.chan = 1;
3193		alu.dst.write = 1;
3194
3195		if (signed_op) {
3196			alu.src[0].sel = tmp2;
3197			alu.src[0].chan = 0;
3198		} else {
3199			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3200		}
3201
3202		alu.src[1].sel = tmp0;
3203		alu.src[1].chan = 1;
3204
3205		alu.last = 1;
3206		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3207			return r;
3208
3209		if (mod) { /* UMOD */
3210
3211			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
3212			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3213			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3214
3215			alu.dst.sel = tmp1;
3216			alu.dst.chan = 2;
3217			alu.dst.write = 1;
3218
3219			alu.src[0].sel = tmp0;
3220			alu.src[0].chan = 3;
3221
3222			if (signed_op) {
3223				alu.src[1].sel = tmp2;
3224				alu.src[1].chan = 1;
3225			} else {
3226				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3227			}
3228
3229			alu.last = 1;
3230			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3231				return r;
3232
3233			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
3234			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3235			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3236
3237			alu.dst.sel = tmp1;
3238			alu.dst.chan = 3;
3239			alu.dst.write = 1;
3240
3241			alu.src[0].sel = tmp0;
3242			alu.src[0].chan = 3;
3243			if (signed_op) {
3244				alu.src[1].sel = tmp2;
3245				alu.src[1].chan = 1;
3246			} else {
3247				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3248			}
3249
3250			alu.last = 1;
3251			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3252				return r;
3253
3254		} else { /* UDIV */
3255
3256			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
3257			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3258			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3259
3260			alu.dst.sel = tmp1;
3261			alu.dst.chan = 2;
3262			alu.dst.write = 1;
3263
3264			alu.src[0].sel = tmp0;
3265			alu.src[0].chan = 2;
3266			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3267
3268			alu.last = 1;
3269			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3270				return r;
3271
3272			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
3273			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3274			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3275
3276			alu.dst.sel = tmp1;
3277			alu.dst.chan = 3;
3278			alu.dst.write = 1;
3279
3280			alu.src[0].sel = tmp0;
3281			alu.src[0].chan = 2;
3282			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
3283
3284			alu.last = 1;
3285			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3286				return r;
3287
3288		}
3289
3290		/* 17. tmp1.x = tmp1.x & tmp1.y */
3291		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3292		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
3293
3294		alu.dst.sel = tmp1;
3295		alu.dst.chan = 0;
3296		alu.dst.write = 1;
3297
3298		alu.src[0].sel = tmp1;
3299		alu.src[0].chan = 0;
3300		alu.src[1].sel = tmp1;
3301		alu.src[1].chan = 1;
3302
3303		alu.last = 1;
3304		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3305			return r;
3306
3307		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
3308		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
3309		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3310		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3311		alu.is_op3 = 1;
3312
3313		alu.dst.sel = tmp0;
3314		alu.dst.chan = 2;
3315		alu.dst.write = 1;
3316
3317		alu.src[0].sel = tmp1;
3318		alu.src[0].chan = 0;
3319		alu.src[1].sel = tmp0;
3320		alu.src[1].chan = mod ? 3 : 2;
3321		alu.src[2].sel = tmp1;
3322		alu.src[2].chan = 2;
3323
3324		alu.last = 1;
3325		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3326			return r;
3327
3328		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
3329		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3330		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3331		alu.is_op3 = 1;
3332
3333		if (signed_op) {
3334			alu.dst.sel = tmp0;
3335			alu.dst.chan = 2;
3336			alu.dst.write = 1;
3337		} else {
3338			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3339		}
3340
3341		alu.src[0].sel = tmp1;
3342		alu.src[0].chan = 1;
3343		alu.src[1].sel = tmp1;
3344		alu.src[1].chan = 3;
3345		alu.src[2].sel = tmp0;
3346		alu.src[2].chan = 2;
3347
3348		alu.last = 1;
3349		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3350			return r;
3351
3352		if (signed_op) {
3353
3354			/* fix the sign of the result */
3355
3356			if (mod) {
3357
3358				/* tmp0.x = -tmp0.z */
3359				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3360				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3361
3362				alu.dst.sel = tmp0;
3363				alu.dst.chan = 0;
3364				alu.dst.write = 1;
3365
3366				alu.src[0].sel = V_SQ_ALU_SRC_0;
3367				alu.src[1].sel = tmp0;
3368				alu.src[1].chan = 2;
3369
3370				alu.last = 1;
3371				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3372					return r;
3373
3374				/* sign of the remainder is the same as the sign of src0 */
3375				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
3376				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3377				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3378				alu.is_op3 = 1;
3379
3380				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3381
3382				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3383				alu.src[1].sel = tmp0;
3384				alu.src[1].chan = 2;
3385				alu.src[2].sel = tmp0;
3386				alu.src[2].chan = 0;
3387
3388				alu.last = 1;
3389				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3390					return r;
3391
3392			} else {
3393
3394				/* tmp0.x = -tmp0.z */
3395				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3396				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3397
3398				alu.dst.sel = tmp0;
3399				alu.dst.chan = 0;
3400				alu.dst.write = 1;
3401
3402				alu.src[0].sel = V_SQ_ALU_SRC_0;
3403				alu.src[1].sel = tmp0;
3404				alu.src[1].chan = 2;
3405
3406				alu.last = 1;
3407				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3408					return r;
3409
3410				/* fix the quotient sign (same as the sign of src0*src1) */
3411				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
3412				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3413				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3414				alu.is_op3 = 1;
3415
3416				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3417
3418				alu.src[0].sel = tmp2;
3419				alu.src[0].chan = 2;
3420				alu.src[1].sel = tmp0;
3421				alu.src[1].chan = 2;
3422				alu.src[2].sel = tmp0;
3423				alu.src[2].chan = 0;
3424
3425				alu.last = 1;
3426				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3427					return r;
3428			}
3429		}
3430	}
3431	return 0;
3432}
3433
/* UDIV: unsigned quotient. Thin wrapper that selects the tgsi_divmod() variant. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;	/* quotient, not remainder */
	const int is_signed = 0;	/* operands are unsigned */

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3438
/* UMOD: unsigned remainder. Thin wrapper that selects the tgsi_divmod() variant. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;	/* remainder, not quotient */
	const int is_signed = 0;	/* operands are unsigned */

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3443
/* IDIV: signed quotient. Thin wrapper that selects the tgsi_divmod() variant. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;	/* quotient, not remainder */
	const int is_signed = 1;	/* operands are signed */

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3448
/* IMOD: signed remainder. Thin wrapper that selects the tgsi_divmod() variant. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;	/* remainder, not quotient */
	const int is_signed = 1;	/* operands are signed */

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3453
3454
3455static int tgsi_f2i(struct r600_shader_ctx *ctx)
3456{
3457	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3458	struct r600_bytecode_alu alu;
3459	int i, r;
3460	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3461	int last_inst = tgsi_last_instruction(write_mask);
3462
3463	for (i = 0; i < 4; i++) {
3464		if (!(write_mask & (1<<i)))
3465			continue;
3466
3467		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3468		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC);
3469
3470		alu.dst.sel = ctx->temp_reg;
3471		alu.dst.chan = i;
3472		alu.dst.write = 1;
3473
3474		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3475		if (i == last_inst)
3476			alu.last = 1;
3477		r = r600_bytecode_add_alu(ctx->bc, &alu);
3478		if (r)
3479			return r;
3480	}
3481
3482	for (i = 0; i < 4; i++) {
3483		if (!(write_mask & (1<<i)))
3484			continue;
3485
3486		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3487		alu.inst = ctx->inst_info->r600_opcode;
3488
3489		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3490
3491		alu.src[0].sel = ctx->temp_reg;
3492		alu.src[0].chan = i;
3493
3494		if (i == last_inst || alu.inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT)
3495			alu.last = 1;
3496		r = r600_bytecode_add_alu(ctx->bc, &alu);
3497		if (r)
3498			return r;
3499	}
3500
3501	return 0;
3502}
3503
3504static int tgsi_iabs(struct r600_shader_ctx *ctx)
3505{
3506	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3507	struct r600_bytecode_alu alu;
3508	int i, r;
3509	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3510	int last_inst = tgsi_last_instruction(write_mask);
3511
3512	/* tmp = -src */
3513	for (i = 0; i < 4; i++) {
3514		if (!(write_mask & (1<<i)))
3515			continue;
3516
3517		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3518		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3519
3520		alu.dst.sel = ctx->temp_reg;
3521		alu.dst.chan = i;
3522		alu.dst.write = 1;
3523
3524		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3525		alu.src[0].sel = V_SQ_ALU_SRC_0;
3526
3527		if (i == last_inst)
3528			alu.last = 1;
3529		r = r600_bytecode_add_alu(ctx->bc, &alu);
3530		if (r)
3531			return r;
3532	}
3533
3534	/* dst = (src >= 0 ? src : tmp) */
3535	for (i = 0; i < 4; i++) {
3536		if (!(write_mask & (1<<i)))
3537			continue;
3538
3539		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3540		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3541		alu.is_op3 = 1;
3542		alu.dst.write = 1;
3543
3544		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3545
3546		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3547		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3548		alu.src[2].sel = ctx->temp_reg;
3549		alu.src[2].chan = i;
3550
3551		if (i == last_inst)
3552			alu.last = 1;
3553		r = r600_bytecode_add_alu(ctx->bc, &alu);
3554		if (r)
3555			return r;
3556	}
3557	return 0;
3558}
3559
3560static int tgsi_issg(struct r600_shader_ctx *ctx)
3561{
3562	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3563	struct r600_bytecode_alu alu;
3564	int i, r;
3565	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3566	int last_inst = tgsi_last_instruction(write_mask);
3567
3568	/* tmp = (src >= 0 ? src : -1) */
3569	for (i = 0; i < 4; i++) {
3570		if (!(write_mask & (1<<i)))
3571			continue;
3572
3573		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3574		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3575		alu.is_op3 = 1;
3576
3577		alu.dst.sel = ctx->temp_reg;
3578		alu.dst.chan = i;
3579		alu.dst.write = 1;
3580
3581		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3582		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3583		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
3584
3585		if (i == last_inst)
3586			alu.last = 1;
3587		r = r600_bytecode_add_alu(ctx->bc, &alu);
3588		if (r)
3589			return r;
3590	}
3591
3592	/* dst = (tmp > 0 ? 1 : tmp) */
3593	for (i = 0; i < 4; i++) {
3594		if (!(write_mask & (1<<i)))
3595			continue;
3596
3597		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3598		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT_INT);
3599		alu.is_op3 = 1;
3600		alu.dst.write = 1;
3601
3602		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3603
3604		alu.src[0].sel = ctx->temp_reg;
3605		alu.src[0].chan = i;
3606
3607		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3608
3609		alu.src[2].sel = ctx->temp_reg;
3610		alu.src[2].chan = i;
3611
3612		if (i == last_inst)
3613			alu.last = 1;
3614		r = r600_bytecode_add_alu(ctx->bc, &alu);
3615		if (r)
3616			return r;
3617	}
3618	return 0;
3619}
3620
3621
3622
3623static int tgsi_ssg(struct r600_shader_ctx *ctx)
3624{
3625	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3626	struct r600_bytecode_alu alu;
3627	int i, r;
3628
3629	/* tmp = (src > 0 ? 1 : src) */
3630	for (i = 0; i < 4; i++) {
3631		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3632		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3633		alu.is_op3 = 1;
3634
3635		alu.dst.sel = ctx->temp_reg;
3636		alu.dst.chan = i;
3637
3638		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3639		alu.src[1].sel = V_SQ_ALU_SRC_1;
3640		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
3641
3642		if (i == 3)
3643			alu.last = 1;
3644		r = r600_bytecode_add_alu(ctx->bc, &alu);
3645		if (r)
3646			return r;
3647	}
3648
3649	/* dst = (-tmp > 0 ? -1 : tmp) */
3650	for (i = 0; i < 4; i++) {
3651		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3652		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3653		alu.is_op3 = 1;
3654		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3655
3656		alu.src[0].sel = ctx->temp_reg;
3657		alu.src[0].chan = i;
3658		alu.src[0].neg = 1;
3659
3660		alu.src[1].sel = V_SQ_ALU_SRC_1;
3661		alu.src[1].neg = 1;
3662
3663		alu.src[2].sel = ctx->temp_reg;
3664		alu.src[2].chan = i;
3665
3666		if (i == 3)
3667			alu.last = 1;
3668		r = r600_bytecode_add_alu(ctx->bc, &alu);
3669		if (r)
3670			return r;
3671	}
3672	return 0;
3673}
3674
3675static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
3676{
3677	struct r600_bytecode_alu alu;
3678	int i, r;
3679
3680	for (i = 0; i < 4; i++) {
3681		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3682		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
3683			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
3684			alu.dst.chan = i;
3685		} else {
3686			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3687			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3688			alu.src[0].sel = ctx->temp_reg;
3689			alu.src[0].chan = i;
3690		}
3691		if (i == 3) {
3692			alu.last = 1;
3693		}
3694		r = r600_bytecode_add_alu(ctx->bc, &alu);
3695		if (r)
3696			return r;
3697	}
3698	return 0;
3699}
3700
3701static int tgsi_op3(struct r600_shader_ctx *ctx)
3702{
3703	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3704	struct r600_bytecode_alu alu;
3705	int i, j, r;
3706	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3707
3708	for (i = 0; i < lasti + 1; i++) {
3709		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3710			continue;
3711
3712		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3713		alu.inst = ctx->inst_info->r600_opcode;
3714		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3715			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3716		}
3717
3718		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3719		alu.dst.chan = i;
3720		alu.dst.write = 1;
3721		alu.is_op3 = 1;
3722		if (i == lasti) {
3723			alu.last = 1;
3724		}
3725		r = r600_bytecode_add_alu(ctx->bc, &alu);
3726		if (r)
3727			return r;
3728	}
3729	return 0;
3730}
3731
3732static int tgsi_dp(struct r600_shader_ctx *ctx)
3733{
3734	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3735	struct r600_bytecode_alu alu;
3736	int i, j, r;
3737
3738	for (i = 0; i < 4; i++) {
3739		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3740		alu.inst = ctx->inst_info->r600_opcode;
3741		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3742			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3743		}
3744
3745		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3746		alu.dst.chan = i;
3747		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3748		/* handle some special cases */
3749		switch (ctx->inst_info->tgsi_opcode) {
3750		case TGSI_OPCODE_DP2:
3751			if (i > 1) {
3752				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3753				alu.src[0].chan = alu.src[1].chan = 0;
3754			}
3755			break;
3756		case TGSI_OPCODE_DP3:
3757			if (i > 2) {
3758				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3759				alu.src[0].chan = alu.src[1].chan = 0;
3760			}
3761			break;
3762		case TGSI_OPCODE_DPH:
3763			if (i == 3) {
3764				alu.src[0].sel = V_SQ_ALU_SRC_1;
3765				alu.src[0].chan = 0;
3766				alu.src[0].neg = 0;
3767			}
3768			break;
3769		default:
3770			break;
3771		}
3772		if (i == 3) {
3773			alu.last = 1;
3774		}
3775		r = r600_bytecode_add_alu(ctx->bc, &alu);
3776		if (r)
3777			return r;
3778	}
3779	return 0;
3780}
3781
3782static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
3783						    unsigned index)
3784{
3785	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3786	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
3787		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
3788		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
3789		ctx->src[index].neg || ctx->src[index].abs;
3790}
3791
3792static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
3793					unsigned index)
3794{
3795	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3796	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
3797}
3798
3799static int tgsi_tex(struct r600_shader_ctx *ctx)
3800{
3801	static float one_point_five = 1.5f;
3802	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3803	struct r600_bytecode_tex tex;
3804	struct r600_bytecode_alu alu;
3805	unsigned src_gpr;
3806	int r, i, j;
3807	int opcode;
3808	bool read_compressed_msaa = ctx->bc->msaa_texture_mode == MSAA_TEXTURE_COMPRESSED &&
3809				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
3810				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
3811				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
3812	/* Texture fetch instructions can only use gprs as source.
3813	 * Also they cannot negate the source or take the absolute value */
3814	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
3815                                              tgsi_tex_src_requires_loading(ctx, 0)) ||
3816					     read_compressed_msaa;
3817	boolean src_loaded = FALSE;
3818	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
3819	uint8_t offset_x = 0, offset_y = 0, offset_z = 0;
3820
3821	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
3822
3823	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
3824		/* get offset values */
3825		if (inst->Texture.NumOffsets) {
3826			assert(inst->Texture.NumOffsets == 1);
3827
3828			offset_x = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
3829			offset_y = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
3830			offset_z = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
3831		}
3832	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
3833		/* TGSI moves the sampler to src reg 3 for TXD */
3834		sampler_src_reg = 3;
3835
3836		for (i = 1; i < 3; i++) {
3837			/* set gradients h/v */
3838			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
3839			tex.inst = (i == 1) ? SQ_TEX_INST_SET_GRADIENTS_H :
3840				SQ_TEX_INST_SET_GRADIENTS_V;
3841			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
3842			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
3843
3844			if (tgsi_tex_src_requires_loading(ctx, i)) {
3845				tex.src_gpr = r600_get_temp(ctx);
3846				tex.src_sel_x = 0;
3847				tex.src_sel_y = 1;
3848				tex.src_sel_z = 2;
3849				tex.src_sel_w = 3;
3850
3851				for (j = 0; j < 4; j++) {
3852					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3853					alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3854                                        r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
3855                                        alu.dst.sel = tex.src_gpr;
3856                                        alu.dst.chan = j;
3857                                        if (j == 3)
3858                                                alu.last = 1;
3859                                        alu.dst.write = 1;
3860                                        r = r600_bytecode_add_alu(ctx->bc, &alu);
3861                                        if (r)
3862                                                return r;
3863				}
3864
3865			} else {
3866				tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i);
3867				tex.src_sel_x = ctx->src[i].swizzle[0];
3868				tex.src_sel_y = ctx->src[i].swizzle[1];
3869				tex.src_sel_z = ctx->src[i].swizzle[2];
3870				tex.src_sel_w = ctx->src[i].swizzle[3];
3871				tex.src_rel = ctx->src[i].rel;
3872			}
3873			tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */
3874			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
3875			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
3876				tex.coord_type_x = 1;
3877				tex.coord_type_y = 1;
3878				tex.coord_type_z = 1;
3879				tex.coord_type_w = 1;
3880			}
3881			r = r600_bytecode_add_tex(ctx->bc, &tex);
3882			if (r)
3883				return r;
3884		}
3885	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
3886		int out_chan;
3887		/* Add perspective divide */
3888		if (ctx->bc->chip_class == CAYMAN) {
3889			out_chan = 2;
3890			for (i = 0; i < 3; i++) {
3891				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3892				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3893				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3894
3895				alu.dst.sel = ctx->temp_reg;
3896				alu.dst.chan = i;
3897				if (i == 2)
3898					alu.last = 1;
3899				if (out_chan == i)
3900					alu.dst.write = 1;
3901				r = r600_bytecode_add_alu(ctx->bc, &alu);
3902				if (r)
3903					return r;
3904			}
3905
3906		} else {
3907			out_chan = 3;
3908			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3909			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3910			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3911
3912			alu.dst.sel = ctx->temp_reg;
3913			alu.dst.chan = out_chan;
3914			alu.last = 1;
3915			alu.dst.write = 1;
3916			r = r600_bytecode_add_alu(ctx->bc, &alu);
3917			if (r)
3918				return r;
3919		}
3920
3921		for (i = 0; i < 3; i++) {
3922			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3923			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
3924			alu.src[0].sel = ctx->temp_reg;
3925			alu.src[0].chan = out_chan;
3926			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3927			alu.dst.sel = ctx->temp_reg;
3928			alu.dst.chan = i;
3929			alu.dst.write = 1;
3930			r = r600_bytecode_add_alu(ctx->bc, &alu);
3931			if (r)
3932				return r;
3933		}
3934		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3935		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3936		alu.src[0].sel = V_SQ_ALU_SRC_1;
3937		alu.src[0].chan = 0;
3938		alu.dst.sel = ctx->temp_reg;
3939		alu.dst.chan = 3;
3940		alu.last = 1;
3941		alu.dst.write = 1;
3942		r = r600_bytecode_add_alu(ctx->bc, &alu);
3943		if (r)
3944			return r;
3945		src_loaded = TRUE;
3946		src_gpr = ctx->temp_reg;
3947	}
3948
3949	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
3950	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) &&
3951	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
3952	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
3953
3954		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
3955		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
3956
3957		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
3958		for (i = 0; i < 4; i++) {
3959			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3960			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE);
3961			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
3962			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
3963			alu.dst.sel = ctx->temp_reg;
3964			alu.dst.chan = i;
3965			if (i == 3)
3966				alu.last = 1;
3967			alu.dst.write = 1;
3968			r = r600_bytecode_add_alu(ctx->bc, &alu);
3969			if (r)
3970				return r;
3971		}
3972
3973		/* tmp1.z = RCP_e(|tmp1.z|) */
3974		if (ctx->bc->chip_class == CAYMAN) {
3975			for (i = 0; i < 3; i++) {
3976				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3977				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3978				alu.src[0].sel = ctx->temp_reg;
3979				alu.src[0].chan = 2;
3980				alu.src[0].abs = 1;
3981				alu.dst.sel = ctx->temp_reg;
3982				alu.dst.chan = i;
3983				if (i == 2)
3984					alu.dst.write = 1;
3985				if (i == 2)
3986					alu.last = 1;
3987				r = r600_bytecode_add_alu(ctx->bc, &alu);
3988				if (r)
3989					return r;
3990			}
3991		} else {
3992			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3993			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3994			alu.src[0].sel = ctx->temp_reg;
3995			alu.src[0].chan = 2;
3996			alu.src[0].abs = 1;
3997			alu.dst.sel = ctx->temp_reg;
3998			alu.dst.chan = 2;
3999			alu.dst.write = 1;
4000			alu.last = 1;
4001			r = r600_bytecode_add_alu(ctx->bc, &alu);
4002			if (r)
4003				return r;
4004		}
4005
4006		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
4007		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
4008		 * muladd has no writemask, have to use another temp
4009		 */
4010		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4011		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4012		alu.is_op3 = 1;
4013
4014		alu.src[0].sel = ctx->temp_reg;
4015		alu.src[0].chan = 0;
4016		alu.src[1].sel = ctx->temp_reg;
4017		alu.src[1].chan = 2;
4018
4019		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4020		alu.src[2].chan = 0;
4021		alu.src[2].value = *(uint32_t *)&one_point_five;
4022
4023		alu.dst.sel = ctx->temp_reg;
4024		alu.dst.chan = 0;
4025		alu.dst.write = 1;
4026
4027		r = r600_bytecode_add_alu(ctx->bc, &alu);
4028		if (r)
4029			return r;
4030
4031		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4032		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4033		alu.is_op3 = 1;
4034
4035		alu.src[0].sel = ctx->temp_reg;
4036		alu.src[0].chan = 1;
4037		alu.src[1].sel = ctx->temp_reg;
4038		alu.src[1].chan = 2;
4039
4040		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4041		alu.src[2].chan = 0;
4042		alu.src[2].value = *(uint32_t *)&one_point_five;
4043
4044		alu.dst.sel = ctx->temp_reg;
4045		alu.dst.chan = 1;
4046		alu.dst.write = 1;
4047
4048		alu.last = 1;
4049		r = r600_bytecode_add_alu(ctx->bc, &alu);
4050		if (r)
4051			return r;
4052		/* write initial W value into Z component */
4053		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
4054			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4055			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4056			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4057			alu.dst.sel = ctx->temp_reg;
4058			alu.dst.chan = 2;
4059			alu.dst.write = 1;
4060			alu.last = 1;
4061			r = r600_bytecode_add_alu(ctx->bc, &alu);
4062			if (r)
4063				return r;
4064		}
4065		src_loaded = TRUE;
4066		src_gpr = ctx->temp_reg;
4067	}
4068
4069	if (src_requires_loading && !src_loaded) {
4070		for (i = 0; i < 4; i++) {
4071			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4072			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4073			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4074			alu.dst.sel = ctx->temp_reg;
4075			alu.dst.chan = i;
4076			if (i == 3)
4077				alu.last = 1;
4078			alu.dst.write = 1;
4079			r = r600_bytecode_add_alu(ctx->bc, &alu);
4080			if (r)
4081				return r;
4082		}
4083		src_loaded = TRUE;
4084		src_gpr = ctx->temp_reg;
4085	}
4086
4087	/* Obtain the sample index for reading a compressed MSAA color texture.
4088	 * To read the FMASK, we use the ldfptr instruction, which tells us
4089	 * where the samples are stored.
4090	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
4091	 * which is the identity mapping. Each nibble says which physical sample
4092	 * should be fetched to get that sample.
4093	 *
4094	 * Assume src.z contains the sample index. It should be modified like this:
4095	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
4096	 * Then fetch the texel with src.
4097	 */
4098	if (read_compressed_msaa) {
4099		unsigned sample_chan = inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ? 3 : 4;
4100		unsigned temp = r600_get_temp(ctx);
4101		assert(src_loaded);
4102
4103		/* temp.w = ldfptr() */
4104		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4105		tex.inst = SQ_TEX_INST_LD;
4106		tex.inst_mod = 1; /* to indicate this is ldfptr */
4107		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4108		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4109		tex.src_gpr = src_gpr;
4110		tex.dst_gpr = temp;
4111		tex.dst_sel_x = 7; /* mask out these components */
4112		tex.dst_sel_y = 7;
4113		tex.dst_sel_z = 7;
4114		tex.dst_sel_w = 0; /* store X */
4115		tex.src_sel_x = 0;
4116		tex.src_sel_y = 1;
4117		tex.src_sel_z = 2;
4118		tex.src_sel_w = 3;
4119		tex.offset_x = offset_x;
4120		tex.offset_y = offset_y;
4121		tex.offset_z = offset_z;
4122		r = r600_bytecode_add_tex(ctx->bc, &tex);
4123		if (r)
4124			return r;
4125
4126		/* temp.x = sample_index*4 */
4127		if (ctx->bc->chip_class == CAYMAN) {
4128			for (i = 0 ; i < 4; i++) {
4129				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4130				alu.inst = ctx->inst_info->r600_opcode;
4131				alu.src[0].sel = src_gpr;
4132				alu.src[0].chan = sample_chan;
4133				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4134				alu.src[1].value = 4;
4135				alu.dst.sel = temp;
4136				alu.dst.chan = i;
4137				alu.dst.write = i == 0;
4138				if (i == 3)
4139					alu.last = 1;
4140				r = r600_bytecode_add_alu(ctx->bc, &alu);
4141				if (r)
4142					return r;
4143			}
4144		} else {
4145			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4146			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT);
4147			alu.src[0].sel = src_gpr;
4148			alu.src[0].chan = sample_chan;
4149			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4150			alu.src[1].value = 4;
4151			alu.dst.sel = temp;
4152			alu.dst.chan = 0;
4153			alu.dst.write = 1;
4154			alu.last = 1;
4155			r = r600_bytecode_add_alu(ctx->bc, &alu);
4156			if (r)
4157				return r;
4158		}
4159
4160		/* sample_index = temp.w >> temp.x */
4161		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4162		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT);
4163		alu.src[0].sel = temp;
4164		alu.src[0].chan = 3;
4165		alu.src[1].sel = temp;
4166		alu.src[1].chan = 0;
4167		alu.dst.sel = src_gpr;
4168		alu.dst.chan = sample_chan;
4169		alu.dst.write = 1;
4170		alu.last = 1;
4171		r = r600_bytecode_add_alu(ctx->bc, &alu);
4172		if (r)
4173			return r;
4174
4175		/* sample_index & 0xF */
4176		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4177		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
4178		alu.src[0].sel = src_gpr;
4179		alu.src[0].chan = sample_chan;
4180		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4181		alu.src[1].value = 0xF;
4182		alu.dst.sel = src_gpr;
4183		alu.dst.chan = sample_chan;
4184		alu.dst.write = 1;
4185		alu.last = 1;
4186		r = r600_bytecode_add_alu(ctx->bc, &alu);
4187		if (r)
4188			return r;
4189#if 0
4190		/* visualize the FMASK */
4191		for (i = 0; i < 4; i++) {
4192			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4193			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
4194			alu.src[0].sel = src_gpr;
4195			alu.src[0].chan = sample_chan;
4196			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4197			alu.dst.chan = i;
4198			alu.dst.write = 1;
4199			alu.last = 1;
4200			r = r600_bytecode_add_alu(ctx->bc, &alu);
4201			if (r)
4202				return r;
4203		}
4204		return 0;
4205#endif
4206	}
4207
4208	opcode = ctx->inst_info->r600_opcode;
4209	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4210	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4211	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4212	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4213	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
4214	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
4215		switch (opcode) {
4216		case SQ_TEX_INST_SAMPLE:
4217			opcode = SQ_TEX_INST_SAMPLE_C;
4218			break;
4219		case SQ_TEX_INST_SAMPLE_L:
4220			opcode = SQ_TEX_INST_SAMPLE_C_L;
4221			break;
4222		case SQ_TEX_INST_SAMPLE_LB:
4223			opcode = SQ_TEX_INST_SAMPLE_C_LB;
4224			break;
4225		case SQ_TEX_INST_SAMPLE_G:
4226			opcode = SQ_TEX_INST_SAMPLE_C_G;
4227			break;
4228		}
4229	}
4230
4231	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4232	tex.inst = opcode;
4233
4234	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4235	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4236	tex.src_gpr = src_gpr;
4237	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4238	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
4239	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
4240	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
4241	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
4242
4243	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
4244		tex.src_sel_x = 4;
4245		tex.src_sel_y = 4;
4246		tex.src_sel_z = 4;
4247		tex.src_sel_w = 4;
4248	} else if (src_loaded) {
4249		tex.src_sel_x = 0;
4250		tex.src_sel_y = 1;
4251		tex.src_sel_z = 2;
4252		tex.src_sel_w = 3;
4253	} else {
4254		tex.src_sel_x = ctx->src[0].swizzle[0];
4255		tex.src_sel_y = ctx->src[0].swizzle[1];
4256		tex.src_sel_z = ctx->src[0].swizzle[2];
4257		tex.src_sel_w = ctx->src[0].swizzle[3];
4258		tex.src_rel = ctx->src[0].rel;
4259	}
4260
4261	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE) {
4262		tex.src_sel_x = 1;
4263		tex.src_sel_y = 0;
4264		tex.src_sel_z = 3;
4265		tex.src_sel_w = 1;
4266	}
4267	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
4268		tex.src_sel_x = 1;
4269		tex.src_sel_y = 0;
4270		tex.src_sel_z = 3;
4271		tex.src_sel_w = 2; /* route Z compare value into W */
4272	}
4273
4274	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
4275	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
4276		tex.coord_type_x = 1;
4277		tex.coord_type_y = 1;
4278	}
4279	tex.coord_type_z = 1;
4280	tex.coord_type_w = 1;
4281
4282	tex.offset_x = offset_x;
4283	tex.offset_y = offset_y;
4284	tex.offset_z = offset_z;
4285
4286	/* Put the depth for comparison in W.
4287	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
4288	 * Some instructions expect the depth in Z. */
4289	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4290	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4291	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4292	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
4293	    opcode != SQ_TEX_INST_SAMPLE_C_L &&
4294	    opcode != SQ_TEX_INST_SAMPLE_C_LB) {
4295		tex.src_sel_w = tex.src_sel_z;
4296	}
4297
4298	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
4299	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4300		if (opcode == SQ_TEX_INST_SAMPLE_C_L ||
4301		    opcode == SQ_TEX_INST_SAMPLE_C_LB) {
4302			/* the array index is read from Y */
4303			tex.coord_type_y = 0;
4304		} else {
4305			/* the array index is read from Z */
4306			tex.coord_type_z = 0;
4307			tex.src_sel_z = tex.src_sel_y;
4308		}
4309	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
4310		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
4311		/* the array index is read from Z */
4312		tex.coord_type_z = 0;
4313
4314	r = r600_bytecode_add_tex(ctx->bc, &tex);
4315	if (r)
4316		return r;
4317
4318	/* add shadow ambient support  - gallium doesn't do it yet */
4319	return 0;
4320}
4321
4322static int tgsi_lrp(struct r600_shader_ctx *ctx)
4323{
4324	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4325	struct r600_bytecode_alu alu;
4326	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4327	unsigned i;
4328	int r;
4329
4330	/* optimize if it's just an equal balance */
4331	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
4332		for (i = 0; i < lasti + 1; i++) {
4333			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4334				continue;
4335
4336			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4337			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4338			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4339			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4340			alu.omod = 3;
4341			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4342			alu.dst.chan = i;
4343			if (i == lasti) {
4344				alu.last = 1;
4345			}
4346			r = r600_bytecode_add_alu(ctx->bc, &alu);
4347			if (r)
4348				return r;
4349		}
4350		return 0;
4351	}
4352
4353	/* 1 - src0 */
4354	for (i = 0; i < lasti + 1; i++) {
4355		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4356			continue;
4357
4358		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4359		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4360		alu.src[0].sel = V_SQ_ALU_SRC_1;
4361		alu.src[0].chan = 0;
4362		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4363		r600_bytecode_src_toggle_neg(&alu.src[1]);
4364		alu.dst.sel = ctx->temp_reg;
4365		alu.dst.chan = i;
4366		if (i == lasti) {
4367			alu.last = 1;
4368		}
4369		alu.dst.write = 1;
4370		r = r600_bytecode_add_alu(ctx->bc, &alu);
4371		if (r)
4372			return r;
4373	}
4374
4375	/* (1 - src0) * src2 */
4376	for (i = 0; i < lasti + 1; i++) {
4377		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4378			continue;
4379
4380		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4381		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4382		alu.src[0].sel = ctx->temp_reg;
4383		alu.src[0].chan = i;
4384		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4385		alu.dst.sel = ctx->temp_reg;
4386		alu.dst.chan = i;
4387		if (i == lasti) {
4388			alu.last = 1;
4389		}
4390		alu.dst.write = 1;
4391		r = r600_bytecode_add_alu(ctx->bc, &alu);
4392		if (r)
4393			return r;
4394	}
4395
4396	/* src0 * src1 + (1 - src0) * src2 */
4397	for (i = 0; i < lasti + 1; i++) {
4398		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4399			continue;
4400
4401		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4402		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4403		alu.is_op3 = 1;
4404		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4405		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4406		alu.src[2].sel = ctx->temp_reg;
4407		alu.src[2].chan = i;
4408
4409		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4410		alu.dst.chan = i;
4411		if (i == lasti) {
4412			alu.last = 1;
4413		}
4414		r = r600_bytecode_add_alu(ctx->bc, &alu);
4415		if (r)
4416			return r;
4417	}
4418	return 0;
4419}
4420
4421static int tgsi_cmp(struct r600_shader_ctx *ctx)
4422{
4423	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4424	struct r600_bytecode_alu alu;
4425	int i, r;
4426	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4427
4428	for (i = 0; i < lasti + 1; i++) {
4429		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4430			continue;
4431
4432		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4433		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE);
4434		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4435		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4436		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
4437		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4438		alu.dst.chan = i;
4439		alu.dst.write = 1;
4440		alu.is_op3 = 1;
4441		if (i == lasti)
4442			alu.last = 1;
4443		r = r600_bytecode_add_alu(ctx->bc, &alu);
4444		if (r)
4445			return r;
4446	}
4447	return 0;
4448}
4449
4450static int tgsi_xpd(struct r600_shader_ctx *ctx)
4451{
4452	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4453	static const unsigned int src0_swizzle[] = {2, 0, 1};
4454	static const unsigned int src1_swizzle[] = {1, 2, 0};
4455	struct r600_bytecode_alu alu;
4456	uint32_t use_temp = 0;
4457	int i, r;
4458
4459	if (inst->Dst[0].Register.WriteMask != 0xf)
4460		use_temp = 1;
4461
4462	for (i = 0; i < 4; i++) {
4463		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4464		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4465		if (i < 3) {
4466			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
4467			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
4468		} else {
4469			alu.src[0].sel = V_SQ_ALU_SRC_0;
4470			alu.src[0].chan = i;
4471			alu.src[1].sel = V_SQ_ALU_SRC_0;
4472			alu.src[1].chan = i;
4473		}
4474
4475		alu.dst.sel = ctx->temp_reg;
4476		alu.dst.chan = i;
4477		alu.dst.write = 1;
4478
4479		if (i == 3)
4480			alu.last = 1;
4481		r = r600_bytecode_add_alu(ctx->bc, &alu);
4482		if (r)
4483			return r;
4484	}
4485
4486	for (i = 0; i < 4; i++) {
4487		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4488		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4489
4490		if (i < 3) {
4491			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
4492			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
4493		} else {
4494			alu.src[0].sel = V_SQ_ALU_SRC_0;
4495			alu.src[0].chan = i;
4496			alu.src[1].sel = V_SQ_ALU_SRC_0;
4497			alu.src[1].chan = i;
4498		}
4499
4500		alu.src[2].sel = ctx->temp_reg;
4501		alu.src[2].neg = 1;
4502		alu.src[2].chan = i;
4503
4504		if (use_temp)
4505			alu.dst.sel = ctx->temp_reg;
4506		else
4507			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4508		alu.dst.chan = i;
4509		alu.dst.write = 1;
4510		alu.is_op3 = 1;
4511		if (i == 3)
4512			alu.last = 1;
4513		r = r600_bytecode_add_alu(ctx->bc, &alu);
4514		if (r)
4515			return r;
4516	}
4517	if (use_temp)
4518		return tgsi_helper_copy(ctx, inst);
4519	return 0;
4520}
4521
4522static int tgsi_exp(struct r600_shader_ctx *ctx)
4523{
4524	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4525	struct r600_bytecode_alu alu;
4526	int r;
4527	int i;
4528
4529	/* result.x = 2^floor(src); */
4530	if (inst->Dst[0].Register.WriteMask & 1) {
4531		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4532
4533		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4534		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4535
4536		alu.dst.sel = ctx->temp_reg;
4537		alu.dst.chan = 0;
4538		alu.dst.write = 1;
4539		alu.last = 1;
4540		r = r600_bytecode_add_alu(ctx->bc, &alu);
4541		if (r)
4542			return r;
4543
4544		if (ctx->bc->chip_class == CAYMAN) {
4545			for (i = 0; i < 3; i++) {
4546				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4547				alu.src[0].sel = ctx->temp_reg;
4548				alu.src[0].chan = 0;
4549
4550				alu.dst.sel = ctx->temp_reg;
4551				alu.dst.chan = i;
4552				alu.dst.write = i == 0;
4553				alu.last = i == 2;
4554				r = r600_bytecode_add_alu(ctx->bc, &alu);
4555				if (r)
4556					return r;
4557			}
4558		} else {
4559			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4560			alu.src[0].sel = ctx->temp_reg;
4561			alu.src[0].chan = 0;
4562
4563			alu.dst.sel = ctx->temp_reg;
4564			alu.dst.chan = 0;
4565			alu.dst.write = 1;
4566			alu.last = 1;
4567			r = r600_bytecode_add_alu(ctx->bc, &alu);
4568			if (r)
4569				return r;
4570		}
4571	}
4572
4573	/* result.y = tmp - floor(tmp); */
4574	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4575		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4576
4577		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
4578		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4579
4580		alu.dst.sel = ctx->temp_reg;
4581#if 0
4582		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4583		if (r)
4584			return r;
4585#endif
4586		alu.dst.write = 1;
4587		alu.dst.chan = 1;
4588
4589		alu.last = 1;
4590
4591		r = r600_bytecode_add_alu(ctx->bc, &alu);
4592		if (r)
4593			return r;
4594	}
4595
4596	/* result.z = RoughApprox2ToX(tmp);*/
4597	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
4598		if (ctx->bc->chip_class == CAYMAN) {
4599			for (i = 0; i < 3; i++) {
4600				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4601				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4602				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4603
4604				alu.dst.sel = ctx->temp_reg;
4605				alu.dst.chan = i;
4606				if (i == 2) {
4607					alu.dst.write = 1;
4608					alu.last = 1;
4609				}
4610
4611				r = r600_bytecode_add_alu(ctx->bc, &alu);
4612				if (r)
4613					return r;
4614			}
4615		} else {
4616			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4617			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4618			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4619
4620			alu.dst.sel = ctx->temp_reg;
4621			alu.dst.write = 1;
4622			alu.dst.chan = 2;
4623
4624			alu.last = 1;
4625
4626			r = r600_bytecode_add_alu(ctx->bc, &alu);
4627			if (r)
4628				return r;
4629		}
4630	}
4631
4632	/* result.w = 1.0;*/
4633	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
4634		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4635
4636		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4637		alu.src[0].sel = V_SQ_ALU_SRC_1;
4638		alu.src[0].chan = 0;
4639
4640		alu.dst.sel = ctx->temp_reg;
4641		alu.dst.chan = 3;
4642		alu.dst.write = 1;
4643		alu.last = 1;
4644		r = r600_bytecode_add_alu(ctx->bc, &alu);
4645		if (r)
4646			return r;
4647	}
4648	return tgsi_helper_copy(ctx, inst);
4649}
4650
4651static int tgsi_log(struct r600_shader_ctx *ctx)
4652{
4653	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4654	struct r600_bytecode_alu alu;
4655	int r;
4656	int i;
4657
4658	/* result.x = floor(log2(|src|)); */
4659	if (inst->Dst[0].Register.WriteMask & 1) {
4660		if (ctx->bc->chip_class == CAYMAN) {
4661			for (i = 0; i < 3; i++) {
4662				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4663
4664				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4665				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4666				r600_bytecode_src_set_abs(&alu.src[0]);
4667
4668				alu.dst.sel = ctx->temp_reg;
4669				alu.dst.chan = i;
4670				if (i == 0)
4671					alu.dst.write = 1;
4672				if (i == 2)
4673					alu.last = 1;
4674				r = r600_bytecode_add_alu(ctx->bc, &alu);
4675				if (r)
4676					return r;
4677			}
4678
4679		} else {
4680			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4681
4682			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4683			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4684			r600_bytecode_src_set_abs(&alu.src[0]);
4685
4686			alu.dst.sel = ctx->temp_reg;
4687			alu.dst.chan = 0;
4688			alu.dst.write = 1;
4689			alu.last = 1;
4690			r = r600_bytecode_add_alu(ctx->bc, &alu);
4691			if (r)
4692				return r;
4693		}
4694
4695		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4696		alu.src[0].sel = ctx->temp_reg;
4697		alu.src[0].chan = 0;
4698
4699		alu.dst.sel = ctx->temp_reg;
4700		alu.dst.chan = 0;
4701		alu.dst.write = 1;
4702		alu.last = 1;
4703
4704		r = r600_bytecode_add_alu(ctx->bc, &alu);
4705		if (r)
4706			return r;
4707	}
4708
4709	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
4710	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4711
4712		if (ctx->bc->chip_class == CAYMAN) {
4713			for (i = 0; i < 3; i++) {
4714				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4715
4716				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4717				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4718				r600_bytecode_src_set_abs(&alu.src[0]);
4719
4720				alu.dst.sel = ctx->temp_reg;
4721				alu.dst.chan = i;
4722				if (i == 1)
4723					alu.dst.write = 1;
4724				if (i == 2)
4725					alu.last = 1;
4726
4727				r = r600_bytecode_add_alu(ctx->bc, &alu);
4728				if (r)
4729					return r;
4730			}
4731		} else {
4732			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4733
4734			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4735			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4736			r600_bytecode_src_set_abs(&alu.src[0]);
4737
4738			alu.dst.sel = ctx->temp_reg;
4739			alu.dst.chan = 1;
4740			alu.dst.write = 1;
4741			alu.last = 1;
4742
4743			r = r600_bytecode_add_alu(ctx->bc, &alu);
4744			if (r)
4745				return r;
4746		}
4747
4748		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4749
4750		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4751		alu.src[0].sel = ctx->temp_reg;
4752		alu.src[0].chan = 1;
4753
4754		alu.dst.sel = ctx->temp_reg;
4755		alu.dst.chan = 1;
4756		alu.dst.write = 1;
4757		alu.last = 1;
4758
4759		r = r600_bytecode_add_alu(ctx->bc, &alu);
4760		if (r)
4761			return r;
4762
4763		if (ctx->bc->chip_class == CAYMAN) {
4764			for (i = 0; i < 3; i++) {
4765				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4766				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4767				alu.src[0].sel = ctx->temp_reg;
4768				alu.src[0].chan = 1;
4769
4770				alu.dst.sel = ctx->temp_reg;
4771				alu.dst.chan = i;
4772				if (i == 1)
4773					alu.dst.write = 1;
4774				if (i == 2)
4775					alu.last = 1;
4776
4777				r = r600_bytecode_add_alu(ctx->bc, &alu);
4778				if (r)
4779					return r;
4780			}
4781		} else {
4782			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4783			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4784			alu.src[0].sel = ctx->temp_reg;
4785			alu.src[0].chan = 1;
4786
4787			alu.dst.sel = ctx->temp_reg;
4788			alu.dst.chan = 1;
4789			alu.dst.write = 1;
4790			alu.last = 1;
4791
4792			r = r600_bytecode_add_alu(ctx->bc, &alu);
4793			if (r)
4794				return r;
4795		}
4796
4797		if (ctx->bc->chip_class == CAYMAN) {
4798			for (i = 0; i < 3; i++) {
4799				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4800				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4801				alu.src[0].sel = ctx->temp_reg;
4802				alu.src[0].chan = 1;
4803
4804				alu.dst.sel = ctx->temp_reg;
4805				alu.dst.chan = i;
4806				if (i == 1)
4807					alu.dst.write = 1;
4808				if (i == 2)
4809					alu.last = 1;
4810
4811				r = r600_bytecode_add_alu(ctx->bc, &alu);
4812				if (r)
4813					return r;
4814			}
4815		} else {
4816			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4817			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4818			alu.src[0].sel = ctx->temp_reg;
4819			alu.src[0].chan = 1;
4820
4821			alu.dst.sel = ctx->temp_reg;
4822			alu.dst.chan = 1;
4823			alu.dst.write = 1;
4824			alu.last = 1;
4825
4826			r = r600_bytecode_add_alu(ctx->bc, &alu);
4827			if (r)
4828				return r;
4829		}
4830
4831		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4832
4833		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4834
4835		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4836		r600_bytecode_src_set_abs(&alu.src[0]);
4837
4838		alu.src[1].sel = ctx->temp_reg;
4839		alu.src[1].chan = 1;
4840
4841		alu.dst.sel = ctx->temp_reg;
4842		alu.dst.chan = 1;
4843		alu.dst.write = 1;
4844		alu.last = 1;
4845
4846		r = r600_bytecode_add_alu(ctx->bc, &alu);
4847		if (r)
4848			return r;
4849	}
4850
4851	/* result.z = log2(|src|);*/
4852	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
4853		if (ctx->bc->chip_class == CAYMAN) {
4854			for (i = 0; i < 3; i++) {
4855				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4856
4857				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4858				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4859				r600_bytecode_src_set_abs(&alu.src[0]);
4860
4861				alu.dst.sel = ctx->temp_reg;
4862				if (i == 2)
4863					alu.dst.write = 1;
4864				alu.dst.chan = i;
4865				if (i == 2)
4866					alu.last = 1;
4867
4868				r = r600_bytecode_add_alu(ctx->bc, &alu);
4869				if (r)
4870					return r;
4871			}
4872		} else {
4873			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4874
4875			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4876			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4877			r600_bytecode_src_set_abs(&alu.src[0]);
4878
4879			alu.dst.sel = ctx->temp_reg;
4880			alu.dst.write = 1;
4881			alu.dst.chan = 2;
4882			alu.last = 1;
4883
4884			r = r600_bytecode_add_alu(ctx->bc, &alu);
4885			if (r)
4886				return r;
4887		}
4888	}
4889
4890	/* result.w = 1.0; */
4891	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
4892		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4893
4894		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4895		alu.src[0].sel = V_SQ_ALU_SRC_1;
4896		alu.src[0].chan = 0;
4897
4898		alu.dst.sel = ctx->temp_reg;
4899		alu.dst.chan = 3;
4900		alu.dst.write = 1;
4901		alu.last = 1;
4902
4903		r = r600_bytecode_add_alu(ctx->bc, &alu);
4904		if (r)
4905			return r;
4906	}
4907
4908	return tgsi_helper_copy(ctx, inst);
4909}
4910
4911static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
4912{
4913	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4914	struct r600_bytecode_alu alu;
4915	int r;
4916
4917	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4918
4919	switch (inst->Instruction.Opcode) {
4920	case TGSI_OPCODE_ARL:
4921		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR;
4922		break;
4923	case TGSI_OPCODE_ARR:
4924		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4925		break;
4926	case TGSI_OPCODE_UARL:
4927		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4928		break;
4929	default:
4930		assert(0);
4931		return -1;
4932	}
4933
4934	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4935	alu.last = 1;
4936	alu.dst.sel = ctx->bc->ar_reg;
4937	alu.dst.write = 1;
4938	r = r600_bytecode_add_alu(ctx->bc, &alu);
4939	if (r)
4940		return r;
4941
4942	ctx->bc->ar_loaded = 0;
4943	return 0;
4944}
4945static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
4946{
4947	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4948	struct r600_bytecode_alu alu;
4949	int r;
4950
4951	switch (inst->Instruction.Opcode) {
4952	case TGSI_OPCODE_ARL:
4953		memset(&alu, 0, sizeof(alu));
4954		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR;
4955		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4956		alu.dst.sel = ctx->bc->ar_reg;
4957		alu.dst.write = 1;
4958		alu.last = 1;
4959
4960		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4961			return r;
4962
4963		memset(&alu, 0, sizeof(alu));
4964		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4965		alu.src[0].sel = ctx->bc->ar_reg;
4966		alu.dst.sel = ctx->bc->ar_reg;
4967		alu.dst.write = 1;
4968		alu.last = 1;
4969
4970		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4971			return r;
4972		break;
4973	case TGSI_OPCODE_ARR:
4974		memset(&alu, 0, sizeof(alu));
4975		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4976		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4977		alu.dst.sel = ctx->bc->ar_reg;
4978		alu.dst.write = 1;
4979		alu.last = 1;
4980
4981		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4982			return r;
4983		break;
4984	case TGSI_OPCODE_UARL:
4985		memset(&alu, 0, sizeof(alu));
4986		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4987		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4988		alu.dst.sel = ctx->bc->ar_reg;
4989		alu.dst.write = 1;
4990		alu.last = 1;
4991
4992		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4993			return r;
4994		break;
4995	default:
4996		assert(0);
4997		return -1;
4998	}
4999
5000	ctx->bc->ar_loaded = 0;
5001	return 0;
5002}
5003
5004static int tgsi_opdst(struct r600_shader_ctx *ctx)
5005{
5006	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5007	struct r600_bytecode_alu alu;
5008	int i, r = 0;
5009
5010	for (i = 0; i < 4; i++) {
5011		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5012
5013		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
5014		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5015
5016		if (i == 0 || i == 3) {
5017			alu.src[0].sel = V_SQ_ALU_SRC_1;
5018		} else {
5019			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5020		}
5021
5022		if (i == 0 || i == 2) {
5023			alu.src[1].sel = V_SQ_ALU_SRC_1;
5024		} else {
5025			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5026		}
5027		if (i == 3)
5028			alu.last = 1;
5029		r = r600_bytecode_add_alu(ctx->bc, &alu);
5030		if (r)
5031			return r;
5032	}
5033	return 0;
5034}
5035
5036static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
5037{
5038	struct r600_bytecode_alu alu;
5039	int r;
5040
5041	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5042	alu.inst = opcode;
5043	alu.execute_mask = 1;
5044	alu.update_pred = 1;
5045
5046	alu.dst.sel = ctx->temp_reg;
5047	alu.dst.write = 1;
5048	alu.dst.chan = 0;
5049
5050	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5051	alu.src[1].sel = V_SQ_ALU_SRC_0;
5052	alu.src[1].chan = 0;
5053
5054	alu.last = 1;
5055
5056	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
5057	if (r)
5058		return r;
5059	return 0;
5060}
5061
5062static int pops(struct r600_shader_ctx *ctx, int pops)
5063{
5064	unsigned force_pop = ctx->bc->force_add_cf;
5065
5066	if (!force_pop) {
5067		int alu_pop = 3;
5068		if (ctx->bc->cf_last) {
5069			if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU))
5070				alu_pop = 0;
5071			else if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER))
5072				alu_pop = 1;
5073		}
5074		alu_pop += pops;
5075		if (alu_pop == 1) {
5076			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER);
5077			ctx->bc->force_add_cf = 1;
5078		} else if (alu_pop == 2) {
5079			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER);
5080			ctx->bc->force_add_cf = 1;
5081		} else {
5082			force_pop = 1;
5083		}
5084	}
5085
5086	if (force_pop) {
5087		r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_POP));
5088		ctx->bc->cf_last->pop_count = pops;
5089		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
5090	}
5091
5092	return 0;
5093}
5094
5095static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
5096{
5097	switch(reason) {
5098	case FC_PUSH_VPM:
5099		ctx->bc->callstack[ctx->bc->call_sp].current--;
5100		break;
5101	case FC_PUSH_WQM:
5102	case FC_LOOP:
5103		ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
5104		break;
5105	case FC_REP:
5106		/* TOODO : for 16 vp asic should -= 2; */
5107		ctx->bc->callstack[ctx->bc->call_sp].current --;
5108		break;
5109	}
5110}
5111
5112static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
5113{
5114	if (check_max_only) {
5115		int diff;
5116		switch (reason) {
5117		case FC_PUSH_VPM:
5118			diff = 1;
5119			break;
5120		case FC_PUSH_WQM:
5121			diff = 4;
5122			break;
5123		default:
5124			assert(0);
5125			diff = 0;
5126		}
5127		if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
5128		    ctx->bc->callstack[ctx->bc->call_sp].max) {
5129			ctx->bc->callstack[ctx->bc->call_sp].max =
5130				ctx->bc->callstack[ctx->bc->call_sp].current + diff;
5131		}
5132		return;
5133	}
5134	switch (reason) {
5135	case FC_PUSH_VPM:
5136		ctx->bc->callstack[ctx->bc->call_sp].current++;
5137		break;
5138	case FC_PUSH_WQM:
5139	case FC_LOOP:
5140		ctx->bc->callstack[ctx->bc->call_sp].current += 4;
5141		break;
5142	case FC_REP:
5143		ctx->bc->callstack[ctx->bc->call_sp].current++;
5144		break;
5145	}
5146
5147	if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
5148	    ctx->bc->callstack[ctx->bc->call_sp].max) {
5149		ctx->bc->callstack[ctx->bc->call_sp].max =
5150			ctx->bc->callstack[ctx->bc->call_sp].current;
5151	}
5152}
5153
5154static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
5155{
5156	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
5157
5158	sp->mid = realloc((void *)sp->mid,
5159						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
5160	sp->mid[sp->num_mid] = ctx->bc->cf_last;
5161	sp->num_mid++;
5162}
5163
5164static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
5165{
5166	ctx->bc->fc_sp++;
5167	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
5168	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
5169}
5170
5171static void fc_poplevel(struct r600_shader_ctx *ctx)
5172{
5173	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
5174	free(sp->mid);
5175	sp->mid = NULL;
5176	sp->num_mid = 0;
5177	sp->start = NULL;
5178	sp->type = 0;
5179	ctx->bc->fc_sp--;
5180}
5181
5182#if 0
/*
 * Everything up to the matching #endif is compiled out.
 *
 * It looks like unfinished scaffolding for RETURN handling and flag-driven
 * loop exits: emit_setret_in_loop_flag() and emit_testflag() are empty
 * stubs and emit_jump_to_offset() never uses its offset argument.
 */
/* Emit a bare RETURN control-flow instruction. */
5183static int emit_return(struct r600_shader_ctx *ctx)
5184{
5185	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
5186	return 0;
5187}
5188
/* Emit a JUMP with the given pop count; the jump target is not filled in yet. */
5189static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
5190{
5191
5192	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
5193	ctx->bc->cf_last->pop_count = pops;
5194	/* XXX work out offset */
5195	return 0;
5196}
5197
/* Stub: does nothing and reports success. */
5198static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
5199{
5200	return 0;
5201}
5202
/* Stub: does nothing. */
5203static void emit_testflag(struct r600_shader_ctx *ctx)
5204{
5205
5206}
5207
/*
 * Sequence: test the flag, jump (pop count 1, offset 4), set the flag to
 * V_SQ_ALU_SRC_0, pop ifidx + 1 levels, then emit RETURN.
 */
5208static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
5209{
5210	emit_testflag(ctx);
5211	emit_jump_to_offset(ctx, 1, 4);
5212	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
5213	pops(ctx, ifidx + 1);
5214	emit_return(ctx);
5215}
5216
/*
 * Test the flag, emit the current instruction's CF opcode with a pop count
 * of 1, record it in the mid list of level fc_sp for later patching, then
 * pop one level.
 */
5217static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
5218{
5219	emit_testflag(ctx);
5220
5221	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
5222	ctx->bc->cf_last->pop_count = 1;
5223
5224	fc_set_mid(ctx, fc_sp);
5225
5226	pops(ctx, 1);
5227}
5228#endif
5229
5230static int tgsi_if(struct r600_shader_ctx *ctx)
5231{
5232	emit_logic_pred(ctx, CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
5233
5234	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
5235
5236	fc_pushlevel(ctx, FC_IF);
5237
5238	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
5239	return 0;
5240}
5241
5242static int tgsi_else(struct r600_shader_ctx *ctx)
5243{
5244	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_ELSE));
5245	ctx->bc->cf_last->pop_count = 1;
5246
5247	fc_set_mid(ctx, ctx->bc->fc_sp);
5248	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
5249	return 0;
5250}
5251
5252static int tgsi_endif(struct r600_shader_ctx *ctx)
5253{
5254	pops(ctx, 1);
5255	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
5256		R600_ERR("if/endif unbalanced in shader\n");
5257		return -1;
5258	}
5259
5260	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
5261		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5262		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
5263	} else {
5264		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
5265	}
5266	fc_poplevel(ctx);
5267
5268	callstack_decrease_current(ctx, FC_PUSH_VPM);
5269	return 0;
5270}
5271
5272static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
5273{
5274	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
5275	 * limited to 4096 iterations, like the other LOOP_* instructions. */
5276	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10));
5277
5278	fc_pushlevel(ctx, FC_LOOP);
5279
5280	/* check stack depth */
5281	callstack_check_depth(ctx, FC_LOOP, 0);
5282	return 0;
5283}
5284
5285static int tgsi_endloop(struct r600_shader_ctx *ctx)
5286{
5287	int i;
5288
5289	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END));
5290
5291	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
5292		R600_ERR("loop/endloop in shader code are not paired.\n");
5293		return -EINVAL;
5294	}
5295
5296	/* fixup loop pointers - from r600isa
5297	   LOOP END points to CF after LOOP START,
5298	   LOOP START point to CF after LOOP END
5299	   BRK/CONT point to LOOP END CF
5300	*/
5301	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
5302
5303	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5304
5305	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
5306		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
5307	}
5308	/* XXX add LOOPRET support */
5309	fc_poplevel(ctx);
5310	callstack_decrease_current(ctx, FC_LOOP);
5311	return 0;
5312}
5313
5314static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
5315{
5316	unsigned int fscp;
5317
5318	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
5319	{
5320		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
5321			break;
5322	}
5323
5324	if (fscp == 0) {
5325		R600_ERR("Break not inside loop/endloop pair\n");
5326		return -EINVAL;
5327	}
5328
5329	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
5330
5331	fc_set_mid(ctx, fscp);
5332
5333	callstack_check_depth(ctx, FC_PUSH_VPM, 1);
5334	return 0;
5335}
5336
5337static int tgsi_umad(struct r600_shader_ctx *ctx)
5338{
5339	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5340	struct r600_bytecode_alu alu;
5341	int i, j, r;
5342	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5343
5344	/* src0 * src1 */
5345	for (i = 0; i < lasti + 1; i++) {
5346		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5347			continue;
5348
5349		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5350
5351		alu.dst.chan = i;
5352		alu.dst.sel = ctx->temp_reg;
5353		alu.dst.write = 1;
5354
5355		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
5356		for (j = 0; j < 2; j++) {
5357		        r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5358		}
5359
5360		alu.last = 1;
5361		r = r600_bytecode_add_alu(ctx->bc, &alu);
5362		if (r)
5363			return r;
5364	}
5365
5366
5367	for (i = 0; i < lasti + 1; i++) {
5368		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5369			continue;
5370
5371		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5372		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5373
5374		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
5375
5376		alu.src[0].sel = ctx->temp_reg;
5377		alu.src[0].chan = i;
5378
5379		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5380		if (i == lasti) {
5381			alu.last = 1;
5382		}
5383		r = r600_bytecode_add_alu(ctx->bc, &alu);
5384		if (r)
5385			return r;
5386	}
5387	return 0;
5388}
5389
/*
 * TGSI opcode translation table built from the plain V_SQ_* hardware
 * opcodes (the variant without the EG_ prefix).
 *
 * Each entry holds, in order:
 *   - the TGSI opcode;
 *   - a flag that is 1 only for MAD, the one entry whose hardware opcode is
 *     an OP3 one (presumably an "is three-operand" marker - confirm against
 *     the struct definition);
 *   - the hardware opcode: V_SQ_ALU_* for ALU work, SQ_TEX_INST_* for
 *     texture entries, V_SQ_CF_* for BRK/CONT;
 *   - the handler that emits the bytecode.
 *
 * The "gap" entries use bare numbers for opcode values that have no
 * TGSI_OPCODE_* name here, presumably so the table can be indexed directly
 * by opcode value - confirm against the lookup code.  Going by its name,
 * tgsi_unsupported marks opcodes that are not translated.
 */
5390static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
5391	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5392	{TGSI_OPCODE_MOV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5393	{TGSI_OPCODE_LIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5394
5395	/* XXX:
5396	 * For state trackers other than OpenGL, we'll want to use
5397	 * _RECIP_IEEE instead.
5398	 */
5399	{TGSI_OPCODE_RCP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
5400
5401	{TGSI_OPCODE_RSQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_rsq},
5402	{TGSI_OPCODE_EXP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5403	{TGSI_OPCODE_LOG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5404	{TGSI_OPCODE_MUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5405	{TGSI_OPCODE_ADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5406	{TGSI_OPCODE_DP3,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5407	{TGSI_OPCODE_DP4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5408	{TGSI_OPCODE_DST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5409	{TGSI_OPCODE_MIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5410	{TGSI_OPCODE_MAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5411	{TGSI_OPCODE_SLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5412	{TGSI_OPCODE_SGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5413	{TGSI_OPCODE_MAD,	1, V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5414	{TGSI_OPCODE_SUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5415	{TGSI_OPCODE_LRP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5416	{TGSI_OPCODE_CND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5417	/* gap */
5418	{20,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5419	{TGSI_OPCODE_DP2A,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5420	/* gap */
5421	{22,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5422	{23,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5423	{TGSI_OPCODE_FRC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5424	{TGSI_OPCODE_CLAMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5425	{TGSI_OPCODE_FLR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5426	{TGSI_OPCODE_ROUND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5427	{TGSI_OPCODE_EX2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5428	{TGSI_OPCODE_LG2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5429	{TGSI_OPCODE_POW,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5430	{TGSI_OPCODE_XPD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5431	/* gap */
5432	{32,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5433	{TGSI_OPCODE_ABS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5434	{TGSI_OPCODE_RCC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5435	{TGSI_OPCODE_DPH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5436	{TGSI_OPCODE_COS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5437	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5438	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5439	{TGSI_OPCODE_KILP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5440	{TGSI_OPCODE_PK2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5441	{TGSI_OPCODE_PK2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5442	{TGSI_OPCODE_PK4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5443	{TGSI_OPCODE_PK4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5444	{TGSI_OPCODE_RFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5445	{TGSI_OPCODE_SEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5446	{TGSI_OPCODE_SFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5447	{TGSI_OPCODE_SGT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5448	{TGSI_OPCODE_SIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5449	{TGSI_OPCODE_SLE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5450	{TGSI_OPCODE_SNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5451	{TGSI_OPCODE_STR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5452	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5453	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5454	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5455	{TGSI_OPCODE_UP2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5456	{TGSI_OPCODE_UP2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5457	{TGSI_OPCODE_UP4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5458	{TGSI_OPCODE_UP4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5459	{TGSI_OPCODE_X2D,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5460	{TGSI_OPCODE_ARA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5461	{TGSI_OPCODE_ARR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5462	{TGSI_OPCODE_BRA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5463	{TGSI_OPCODE_CAL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5464	{TGSI_OPCODE_RET,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5465	{TGSI_OPCODE_SSG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5466	{TGSI_OPCODE_CMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5467	{TGSI_OPCODE_SCS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5468	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5469	{TGSI_OPCODE_NRM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5470	{TGSI_OPCODE_DIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5471	{TGSI_OPCODE_DP2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5472	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5473	{TGSI_OPCODE_BRK,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5474	{TGSI_OPCODE_IF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5475	/* gap */
5476	{75,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5477	{76,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5478	{TGSI_OPCODE_ELSE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5479	{TGSI_OPCODE_ENDIF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5480	/* gap */
5481	{79,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5482	{80,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5483	{TGSI_OPCODE_PUSHA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5484	{TGSI_OPCODE_POPA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5485	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5486	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5487	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5488	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5489	{TGSI_OPCODE_SHL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2_trans},
5490	/* gap */
5491	{88,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5492	{TGSI_OPCODE_AND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5493	{TGSI_OPCODE_OR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5494	{TGSI_OPCODE_MOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5495	{TGSI_OPCODE_XOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5496	{TGSI_OPCODE_SAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5497	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5498	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5499	{TGSI_OPCODE_CONT,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5500	{TGSI_OPCODE_EMIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5501	{TGSI_OPCODE_ENDPRIM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5502	{TGSI_OPCODE_BGNLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5503	{TGSI_OPCODE_BGNSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5504	{TGSI_OPCODE_ENDLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5505	{TGSI_OPCODE_ENDSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5506	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5507	/* gap */
5508	{104,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5509	{105,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5510	{106,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5511	{TGSI_OPCODE_NOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5512	/* gap */
5513	{108,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5514	{109,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5515	{110,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5516	{111,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5517	{TGSI_OPCODE_NRM4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5518	{TGSI_OPCODE_CALLNZ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5519	{TGSI_OPCODE_IFC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5520	{TGSI_OPCODE_BREAKC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5521	{TGSI_OPCODE_KIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5522	{TGSI_OPCODE_END,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5523	/* gap */
5524	{118,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5525	{TGSI_OPCODE_F2I,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2_trans},
5526	{TGSI_OPCODE_IDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5527	{TGSI_OPCODE_IMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5528	{TGSI_OPCODE_IMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5529	{TGSI_OPCODE_INEG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5530	{TGSI_OPCODE_ISGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5531	{TGSI_OPCODE_ISHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2_trans},
5532	{TGSI_OPCODE_ISLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5533	{TGSI_OPCODE_F2U,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2_trans},
5534	{TGSI_OPCODE_U2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5535	{TGSI_OPCODE_UADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5536	{TGSI_OPCODE_UDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5537	{TGSI_OPCODE_UMAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5538	{TGSI_OPCODE_UMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5539	{TGSI_OPCODE_UMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5540	{TGSI_OPCODE_UMOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5541	{TGSI_OPCODE_UMUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5542	{TGSI_OPCODE_USEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5543	{TGSI_OPCODE_USGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5544	{TGSI_OPCODE_USHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2_trans},
5545	{TGSI_OPCODE_USLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5546	{TGSI_OPCODE_USNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2_swap},
5547	{TGSI_OPCODE_SWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5548	{TGSI_OPCODE_CASE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5549	{TGSI_OPCODE_DEFAULT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5550	{TGSI_OPCODE_ENDSWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5551	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5552	{TGSI_OPCODE_SAMPLE_I,  0, 0, tgsi_unsupported},
5553	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
5554	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5555	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5556	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5557	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5558	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5559	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5560	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5561	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5562	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5563	{TGSI_OPCODE_UARL,      0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_r600_arl},
5564	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5565	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5566	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5567	{TGSI_OPCODE_LAST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5568};
5569
/*
 * TGSI opcode translation table built from the EG_V_SQ_* hardware opcodes
 * (presumably the Evergreen variant, going by the eg_/EG_ prefixes).
 *
 * Each entry holds, in order:
 *   - the TGSI opcode;
 *   - a flag that is 1 only for MAD, the one entry whose hardware opcode is
 *     an OP3 one (presumably an "is three-operand" marker - confirm against
 *     the struct definition);
 *   - the hardware opcode: EG_V_SQ_ALU_* for ALU work, SQ_TEX_INST_* for
 *     texture entries, EG_V_SQ_CF_* for BRK/CONT;
 *   - the handler that emits the bytecode.
 *
 * The "gap" entries use bare numbers for opcode values that have no
 * TGSI_OPCODE_* name here, presumably so the table can be indexed directly
 * by opcode value - confirm against the lookup code.  Going by its name,
 * tgsi_unsupported marks opcodes that are not translated.
 */
5570static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
5571	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5572	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5573	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5574	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, tgsi_trans_srcx_replicate},
5575	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, tgsi_rsq},
5576	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5577	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5578	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5579	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5580	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5581	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5582	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5583	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5584	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5585	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5586	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5587	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5588	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5589	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5590	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5591	/* gap */
5592	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5593	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5594	/* gap */
5595	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5596	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5597	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5598	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5599	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5600	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5601	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5602	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5603	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5604	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5605	/* gap */
5606	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5607	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5608	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5609	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5610	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5611	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5612	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5613	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5614	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5615	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5616	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5617	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5618	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5619	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5620	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5621	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5622	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5623	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5624	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5625	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5626	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5627	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5628	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5629	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5630	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5631	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5632	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5633	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5634	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5635	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5636	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5637	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5638	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5639	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5640	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5641	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5642	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5643	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5644	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5645	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5646	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5647	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5648	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5649	/* gap */
5650	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5651	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5652	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5653	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5654	/* gap */
5655	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5656	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5657	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5658	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5659	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5660	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5661	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5662	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5663	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
5664	/* gap */
5665	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5666	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5667	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5668	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5669	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5670	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5671	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5672	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5673	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5674	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5675	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5676	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5677	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5678	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5679	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5680	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5681	/* gap */
5682	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5683	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5684	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5685	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5686	/* gap */
5687	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5688	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5689	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5690	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5691	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5692	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5693	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5694	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5695	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5696	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5697	/* gap */
5698	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5699	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_f2i},
5700	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5701	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5702	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5703	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5704	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5705	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
5706	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5707	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_f2i},
5708	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5709	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5710	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5711	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5712	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5713	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5714	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5715	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5716	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5717	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5718	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
5719	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5720	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
5721	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5722	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5723	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5724	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5725	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5726	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
5727	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
5728	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5729	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5730	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5731	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5732	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5733	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5734	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5735	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5736	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5737	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
5738	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5739	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5740	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5741	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5742};
5743
/*
 * Per-opcode TGSI translation table; "cm" is presumably Cayman, since the
 * rows that need special treatment call cayman_* helpers (confirm against
 * the code that selects this table).
 *
 * Each row holds four values, in order:
 *   1. the TGSI opcode the row stands for, written as a bare number on the
 *      placeholder rows marked "gap";
 *   2. a 0/1 value that is 1 only on MAD, the one row whose hardware opcode
 *      is an OP3 constant (presumably a "three-operand encoding" flag --
 *      confirm against struct r600_shader_tgsi_instruction);
 *   3. a hardware instruction constant: an ALU OP2/OP3 opcode on most rows,
 *      an SQ_TEX_INST_* value on rows handled by tgsi_tex, a CF instruction
 *      on BRK/CONT, and NOP or 0 on rows handled by tgsi_unsupported and on
 *      many rows that have a dedicated handler;
 *   4. the handler function for the opcode.
 *
 * NOTE(review): every bare number on a placeholder row equals that row's
 * zero-based position in the table, which strongly suggests the table is
 * indexed directly by opcode value. Do not reorder, insert or drop rows
 * without checking the lookup site.
 */
5744static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
5745	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5746	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5747	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
	/* RCP and RSQ (and EX2/LG2 further down) go through cayman_emit_float_instr */
5748	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, cayman_emit_float_instr},
5749	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, cayman_emit_float_instr},
5750	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5751	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5752	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5753	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
	/* DP3, DP4, DPH and DP2 all map to the DOT4 opcode via tgsi_dp */
5754	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5755	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5756	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5757	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5758	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
	/*
	 * "Less than" style compares (SLT, SLE, ISLT, USLT) are listed with the
	 * SETGT/SETGE opcode and tgsi_op2_swap -- presumably the handler swaps
	 * the two sources; confirm in tgsi_op2_swap.
	 */
5759	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5760	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
	/* the only row with the second column set to 1 and an OP3 opcode */
5761	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
	/* SUB is listed with the ADD opcode; NOTE(review): negation presumably happens in tgsi_op2 -- confirm */
5762	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5763	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5764	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5765	/* gap: placeholder row keyed by a bare number instead of a TGSI_OPCODE_* name */
5766	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5767	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5768	/* gap: placeholder rows keyed by bare numbers */
5769	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5770	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5771	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5772	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5773	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5774	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5775	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, cayman_emit_float_instr},
5776	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, cayman_emit_float_instr},
5777	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, cayman_pow},
5778	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5779	/* gap: placeholder row keyed by a bare number */
5780	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* ABS is listed with the MOV opcode; NOTE(review): the abs modifier is presumably applied in tgsi_op2 -- confirm */
5781	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5782	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5783	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
	/* COS and SIN both use cayman_trig */
5784	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, cayman_trig},
	/* DDX/DDY are handled by tgsi_tex, so the third column is an SQ_TEX_INST_* value */
5785	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5786	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5787	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5788	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5789	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5790	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5791	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5792	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5793	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5794	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5795	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5796	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, cayman_trig},
5797	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5798	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5799	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* TEX and TXP share SQ_TEX_INST_SAMPLE; any difference must come from tgsi_tex itself */
5800	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5801	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5802	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5803	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5804	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5805	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5806	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5807	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5808	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* ARR uses the same handler as ARL and UARL (tgsi_eg_arl) */
5809	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5810	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5811	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5812	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5813	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5814	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5815	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5816	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5817	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5818	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5819	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5820	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
	/* BRK (and CONT below) carry a CF instruction constant rather than an ALU opcode */
5821	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5822	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5823	/* gap: placeholder rows keyed by bare numbers */
5824	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5825	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5826	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5827	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5828	/* gap: placeholder rows keyed by bare numbers */
5829	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5830	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5831	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5832	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5833	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
	/*
	 * I2F here, and F2I/F2U/U2F further down, use plain tgsi_op2. The CAYMAN
	 * notes at the top of the file list FLT_TO_UINT, INT_TO_FLT and
	 * UINT_TO_FLT among the ops implemented in all vector slots.
	 */
5834	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2},
5835	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5836	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5837	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
5838	/* gap: placeholder row keyed by a bare number */
5839	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5840	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5841	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
	/* MOD is routed to tgsi_imod */
5842	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5843	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5844	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5845	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
	/* TXQ and TXQ_LZ share SQ_TEX_INST_GET_TEXTURE_RESINFO */
5846	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5847	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5848	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5849	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5850	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5851	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5852	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5853	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5854	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5855	/* gap: placeholder rows keyed by bare numbers */
5856	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5857	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5858	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* NOTE(review): TGSI NOP is routed to tgsi_unsupported here -- confirm this is intended */
5859	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5860	/* gap: placeholder rows keyed by bare numbers */
5861	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5862	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5863	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5864	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5865	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5866	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5867	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5868	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5869	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5870	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5871	/* gap: placeholder row keyed by a bare number */
5872	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5873	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2},
5874	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5875	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5876	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
	/* INEG is listed with the SUB_INT opcode and its own handler, tgsi_ineg */
5877	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5878	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5879	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
5880	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5881	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
5882	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2},
5883	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5884	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5885	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5886	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5887	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5888	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
	/* UMUL is listed with the signed MULLO_INT opcode and the Cayman-specific cayman_mul_int_instr handler */
5889	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT, cayman_mul_int_instr},
5890	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5891	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5892	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
5893	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5894	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
5895	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5896	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5897	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5898	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* SAMPLE* through SAMPLE_INFO: all routed to tgsi_unsupported, third column left at 0 */
5899	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5900	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
5901	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
5902	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5903	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5904	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5905	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5906	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5907	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5908	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5909	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5910	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5911	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
5912	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
	/* IABS and ISSG have dedicated handlers and leave the third column at 0 */
5913	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5914	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
	/* final row, keyed by TGSI_OPCODE_LAST */
5915	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5916};
5917