r600_shader.c revision 7d532800d8be5ce31731658564691ae9cdaacf7a
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600_shader.h"
28#include "r600d.h"
29
30#include "pipe/p_shader_tokens.h"
31#include "tgsi/tgsi_info.h"
32#include "tgsi/tgsi_parse.h"
33#include "tgsi/tgsi_scan.h"
34#include "tgsi/tgsi_dump.h"
35#include "util/u_memory.h"
36#include <stdio.h>
37#include <errno.h>
38#include <byteswap.h>
39
40/* CAYMAN notes
This note explains why many instructions are emitted in per-channel loops on CAYMAN.
42
43-These 8xx t-slot only ops are implemented in all vector slots.
44MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
45These 8xx t-slot only opcodes become vector ops, with all four
46slots expecting the arguments on sources a and b. Result is
47broadcast to all channels.
48MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
49These 8xx t-slot only opcodes become vector ops in the z, y, and
50x slots.
51EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
52RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
53SQRT_IEEE/_64
54SIN/COS
55The w slot may have an independent co-issued operation, or if the
56result is required to be in the w slot, the opcode above may be
57issued in the w slot as well.
58The compiler must issue the source argument to slots z, y, and x
59*/
60
61static int r600_pipe_shader(struct pipe_context *ctx, struct r600_pipe_shader *shader)
62{
63	struct r600_context *rctx = (struct r600_context *)ctx;
64	struct r600_shader *rshader = &shader->shader;
65	uint32_t *ptr;
66	int	i;
67
68	/* copy new shader */
69	if (shader->bo == NULL) {
70		shader->bo = (struct r600_resource*)
71			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, rshader->bc.ndw * 4);
72		if (shader->bo == NULL) {
73			return -ENOMEM;
74		}
75		ptr = (uint32_t*)rctx->ws->buffer_map(shader->bo->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
76		if (R600_BIG_ENDIAN) {
77			for (i = 0; i < rshader->bc.ndw; ++i) {
78				ptr[i] = bswap_32(rshader->bc.bytecode[i]);
79			}
80		} else {
81			memcpy(ptr, rshader->bc.bytecode, rshader->bc.ndw * sizeof(*ptr));
82		}
83		rctx->ws->buffer_unmap(shader->bo->cs_buf);
84	}
85	/* build state */
86	switch (rshader->processor_type) {
87	case TGSI_PROCESSOR_VERTEX:
88		if (rctx->chip_class >= EVERGREEN) {
89			evergreen_pipe_shader_vs(ctx, shader);
90		} else {
91			r600_pipe_shader_vs(ctx, shader);
92		}
93		break;
94	case TGSI_PROCESSOR_FRAGMENT:
95		if (rctx->chip_class >= EVERGREEN) {
96			evergreen_pipe_shader_ps(ctx, shader);
97		} else {
98			r600_pipe_shader_ps(ctx, shader);
99		}
100		break;
101	default:
102		return -EINVAL;
103	}
104	return 0;
105}
106
107static int r600_shader_from_tgsi(struct r600_screen *rscreen,
108				 struct r600_pipe_shader *pipeshader,
109				 struct r600_shader_key key);
110
111static void r600_dump_streamout(struct pipe_stream_output_info *so)
112{
113	unsigned i;
114
115	fprintf(stderr, "STREAMOUT\n");
116	for (i = 0; i < so->num_outputs; i++) {
117		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
118				so->output[i].start_component;
119		fprintf(stderr, "  %i: MEM_STREAM0_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
120			i, so->output[i].output_buffer,
121			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
122			so->output[i].register_index,
123			mask & 1 ? "x" : "",
124		        mask & 2 ? "y" : "",
125		        mask & 4 ? "z" : "",
126		        mask & 8 ? "w" : "",
127			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
128	}
129}
130
131int r600_pipe_shader_create(struct pipe_context *ctx,
132			    struct r600_pipe_shader *shader,
133			    struct r600_shader_key key)
134{
135	static int dump_shaders = -1;
136	struct r600_context *rctx = (struct r600_context *)ctx;
137	struct r600_pipe_shader_selector *sel = shader->selector;
138	int r;
139
140	/* Would like some magic "get_bool_option_once" routine.
141	*/
142	if (dump_shaders == -1)
143		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
144
145	if (dump_shaders) {
146		fprintf(stderr, "--------------------------------------------------------------\n");
147		tgsi_dump(sel->tokens, 0);
148
149		if (sel->so.num_outputs) {
150			r600_dump_streamout(&sel->so);
151		}
152	}
153	r = r600_shader_from_tgsi(rctx->screen, shader, key);
154	if (r) {
155		R600_ERR("translation from TGSI failed !\n");
156		return r;
157	}
158	r = r600_bytecode_build(&shader->shader.bc);
159	if (r) {
160		R600_ERR("building bytecode failed !\n");
161		return r;
162	}
163	if (dump_shaders) {
164		r600_bytecode_dump(&shader->shader.bc);
165		fprintf(stderr, "______________________________________________________________\n");
166	}
167	return r600_pipe_shader(ctx, shader);
168}
169
170void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
171{
172	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
173	r600_bytecode_clear(&shader->shader.bc);
174}
175
176/*
177 * tgsi -> r600 shader
178 */
179struct r600_shader_tgsi_instruction;
180
/* Source operand as filled in by tgsi_src() from a TGSI source register. */
struct r600_shader_src {
	/* register selector: a special-constant/literal selector for immediates,
	 * otherwise the TGSI index plus the per-file offset */
	unsigned				sel;
	/* per-channel swizzle, copied from SwizzleX..SwizzleW */
	unsigned				swizzle[4];
	/* copied from the TGSI Negate flag (may be adjusted for special constants) */
	unsigned				neg;
	/* copied from the TGSI Absolute flag */
	unsigned				abs;
	/* set to V_SQ_REL_RELATIVE for indirectly addressed registers */
	unsigned				rel;
	/* taken from Dimension.Index for constants that use a dimension */
	unsigned				kc_bank;
	/* the four literal dwords when sel is V_SQ_ALU_SRC_LITERAL */
	uint32_t				value[4];
};
190
/* Translation state shared by the TGSI and byte-stream translators. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	/* processor type, compared against TGSI_PROCESSOR_* */
	unsigned				type;
	/* per TGSI register file: offset added to a register index */
	unsigned				file_offset[TGSI_FILE_COUNT];
	/* base used by r600_get_temp() for driver temporaries */
	unsigned				temp_reg;
	/* table entry of the instruction currently being translated */
	struct r600_shader_tgsi_instruction	*inst_info;
	/* bytecode that instructions are appended to */
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];
	/* immediate values, four dwords per TGSI immediate */
	uint32_t				*literals;
	uint32_t				nliterals;
	/* number of driver temporaries handed out by r600_get_temp() */
	uint32_t				max_driver_temp_used;
	/* when set, evergreen_interp_input() emits no interpolation ALU code */
	boolean use_llvm;
	/* needed for evergreen interpolation */
	boolean                                 input_centroid;
	boolean                                 input_linear;
	boolean                                 input_perspective;
	/* computed by evergreen_gpr_count() */
	int					num_interp_gpr;
	/* GPR of the FACE input of a fragment shader */
	int					face_gpr;
	/* number of COLOR inputs declared by a fragment shader */
	int					colors_used;
	/* set when a vertex shader declares a CLIPVERTEX output */
	boolean                 clip_vertex_write;
	/* index of that CLIPVERTEX output */
	unsigned                cv_output;
	/* index of the POSITION input of a fragment shader */
	int					fragcoord_input;
	/* when zero, the instance id is converted to float at declaration time */
	int					native_integers;
};
218
/*
 * One entry of the per-chip instruction tables; the tables are indexed by
 * TGSI opcode.
 */
struct r600_shader_tgsi_instruction {
	unsigned	tgsi_opcode;
	/* presumably non-zero for three-source (OP3) ALU opcodes -- confirm against the tables */
	unsigned	is_op3;
	/* presumably the hardware opcode emitted for this TGSI opcode -- confirm against the tables */
	unsigned	r600_opcode;
	/* translation callback, invoked with the shader context */
	int (*process)(struct r600_shader_ctx *ctx);
};
225
/*
 * Forward declarations: the per-chip instruction tables and the control-flow
 * helpers are referenced by the byte-stream translators below before their
 * definitions appear in this file.
 */
static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
static int tgsi_else(struct r600_shader_ctx *ctx);
static int tgsi_endif(struct r600_shader_ctx *ctx);
static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
static int tgsi_endloop(struct r600_shader_ctx *ctx);
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
235
236/*
237 * bytestream -> r600 shader
238 *
239 * These functions are used to transform the output of the LLVM backend into
240 * struct r600_bytecode.
241 */
242
243static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
244				unsigned char * bytes,	unsigned num_bytes);
245
246#ifdef HAVE_OPENCL
247int r600_compute_shader_create(struct pipe_context * ctx,
248	LLVMModuleRef mod,  struct r600_bytecode * bytecode)
249{
250	struct r600_context *r600_ctx = (struct r600_context *)ctx;
251	unsigned char * bytes;
252	unsigned byte_count;
253	struct r600_shader_ctx shader_ctx;
254	unsigned dump = 0;
255
256	if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
257		dump = 1;
258	}
259
260	r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
261	shader_ctx.bc = bytecode;
262	r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family,
263			   r600_ctx->screen->msaa_texture_support);
264	shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
265	r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
266	if (shader_ctx.bc->chip_class == CAYMAN) {
267		cm_bytecode_add_cf_end(shader_ctx.bc);
268	}
269	r600_bytecode_build(shader_ctx.bc);
270	if (dump) {
271		r600_bytecode_dump(shader_ctx.bc);
272	}
273	free(bytes);
274	return 1;
275}
276
277#endif /* HAVE_OPENCL */
278
/*
 * Read a little-endian 32-bit value from @bytes starting at offset
 * *@bytes_read and advance *@bytes_read by four.
 *
 * Each byte is widened to uint32_t before shifting: shifting the promoted
 * (signed) int by 24 would overflow for bytes >= 0x80.
 */
static uint32_t i32_from_byte_stream(unsigned char * bytes,
		unsigned * bytes_read)
{
	unsigned i;
	uint32_t out = 0;
	for (i = 0; i < 4; i++) {
		out |= (uint32_t)bytes[(*bytes_read)++] << (8 * i);
	}
	return out;
}
289
290static unsigned r600_src_from_byte_stream(unsigned char * bytes,
291		unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
292{
293	unsigned i;
294	unsigned sel0, sel1;
295	sel0 = bytes[bytes_read++];
296	sel1 = bytes[bytes_read++];
297	alu->src[src_idx].sel = sel0 | (sel1 << 8);
298	alu->src[src_idx].chan = bytes[bytes_read++];
299	alu->src[src_idx].neg = bytes[bytes_read++];
300	alu->src[src_idx].abs = bytes[bytes_read++];
301	alu->src[src_idx].rel = bytes[bytes_read++];
302	alu->src[src_idx].kc_bank = bytes[bytes_read++];
303	for (i = 0; i < 4; i++) {
304		alu->src[src_idx].value |= bytes[bytes_read++] << (i * 8);
305	}
306	return bytes_read;
307}
308
309static unsigned r600_alu_from_byte_stream(struct r600_shader_ctx *ctx,
310				unsigned char * bytes, unsigned bytes_read)
311{
312	unsigned src_idx, src_num;
313	struct r600_bytecode_alu alu;
314	unsigned src_use_sel[3];
315	unsigned src_sel[3] = {};
316	uint32_t word0, word1;
317
318	src_num = bytes[bytes_read++];
319
320	memset(&alu, 0, sizeof(alu));
321	for(src_idx = 0; src_idx < src_num; src_idx++) {
322		unsigned i;
323		src_use_sel[src_idx] = bytes[bytes_read++];
324		for (i = 0; i < 4; i++) {
325			src_sel[src_idx] |= bytes[bytes_read++] << (i * 8);
326		}
327		for (i = 0; i < 4; i++) {
328			alu.src[src_idx].value |= bytes[bytes_read++] << (i * 8);
329		}
330	}
331
332	word0 = i32_from_byte_stream(bytes, &bytes_read);
333	word1 = i32_from_byte_stream(bytes, &bytes_read);
334
335	switch(ctx->bc->chip_class) {
336	default:
337	case R600:
338		r600_bytecode_alu_read(&alu, word0, word1);
339		break;
340	case R700:
341	case EVERGREEN:
342	case CAYMAN:
343		r700_bytecode_alu_read(&alu, word0, word1);
344		break;
345	}
346
347	for(src_idx = 0; src_idx < src_num; src_idx++) {
348		if (src_use_sel[src_idx]) {
349			unsigned sel = src_sel[src_idx];
350
351			alu.src[src_idx].chan = sel & 3;
352			sel >>= 2;
353
354			if (sel>=512) { /* constant */
355				sel -= 512;
356				alu.src[src_idx].kc_bank = sel >> 12;
357				alu.src[src_idx].sel = (sel & 4095) + 512;
358			}
359			else {
360				alu.src[src_idx].sel = sel;
361			}
362		}
363	}
364
365#if HAVE_LLVM < 0x0302
366	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE) ||
367	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE) ||
368	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT) ||
369	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT)) {
370		alu.update_pred = 1;
371		alu.dst.write = 0;
372		alu.src[1].sel = V_SQ_ALU_SRC_0;
373		alu.src[1].chan = 0;
374		alu.last = 1;
375	}
376#endif
377
378	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT)) {
379		ctx->bc->ar_reg = alu.src[0].sel;
380		ctx->bc->ar_loaded = 0;
381		return bytes_read;
382	}
383
384	if (alu.execute_mask) {
385		alu.pred_sel = 0;
386		r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
387	} else {
388		r600_bytecode_add_alu(ctx->bc, &alu);
389	}
390
391	/* XXX: Handle other KILL instructions */
392	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT)) {
393		ctx->shader->uses_kill = 1;
394		/* XXX: This should be enforced in the LLVM backend. */
395		ctx->bc->force_add_cf = 1;
396	}
397	return bytes_read;
398}
399
400static void llvm_if(struct r600_shader_ctx *ctx)
401{
402	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
403	fc_pushlevel(ctx, FC_IF);
404	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
405}
406
407static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx)
408{
409	unsigned opcode = TGSI_OPCODE_BRK;
410	if (ctx->bc->chip_class == CAYMAN)
411		ctx->inst_info = &cm_shader_tgsi_instruction[opcode];
412	else if (ctx->bc->chip_class >= EVERGREEN)
413		ctx->inst_info = &eg_shader_tgsi_instruction[opcode];
414	else
415		ctx->inst_info = &r600_shader_tgsi_instruction[opcode];
416	llvm_if(ctx);
417	tgsi_loop_brk_cont(ctx);
418	tgsi_endif(ctx);
419}
420
421static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
422				unsigned char * bytes, unsigned bytes_read)
423{
424	struct r600_bytecode_alu alu;
425	unsigned inst;
426	memset(&alu, 0, sizeof(alu));
427	bytes_read = r600_src_from_byte_stream(bytes, bytes_read, &alu, 0);
428	inst = bytes[bytes_read++];
429	switch (inst) {
430	case 0: /* IF_PREDICATED */
431		llvm_if(ctx);
432		break;
433	case 1: /* ELSE */
434		tgsi_else(ctx);
435		break;
436	case 2: /* ENDIF */
437		tgsi_endif(ctx);
438		break;
439	case 3: /* BGNLOOP */
440		tgsi_bgnloop(ctx);
441		break;
442	case 4: /* ENDLOOP */
443		tgsi_endloop(ctx);
444		break;
445	case 5: /* PREDICATED_BREAK */
446		r600_break_from_byte_stream(ctx);
447		break;
448	case 6: /* CONTINUE */
449		{
450			unsigned opcode = TGSI_OPCODE_CONT;
451			if (ctx->bc->chip_class == CAYMAN) {
452				ctx->inst_info =
453					&cm_shader_tgsi_instruction[opcode];
454			} else if (ctx->bc->chip_class >= EVERGREEN) {
455				ctx->inst_info =
456					&eg_shader_tgsi_instruction[opcode];
457			} else {
458				ctx->inst_info =
459					&r600_shader_tgsi_instruction[opcode];
460			}
461			tgsi_loop_brk_cont(ctx);
462		}
463		break;
464	}
465
466	return bytes_read;
467}
468
469static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
470				unsigned char * bytes, unsigned bytes_read)
471{
472	struct r600_bytecode_tex tex;
473
474	tex.inst = bytes[bytes_read++];
475	tex.resource_id = bytes[bytes_read++];
476	tex.src_gpr = bytes[bytes_read++];
477	tex.src_rel = bytes[bytes_read++];
478	tex.dst_gpr = bytes[bytes_read++];
479	tex.dst_rel = bytes[bytes_read++];
480	tex.dst_sel_x = bytes[bytes_read++];
481	tex.dst_sel_y = bytes[bytes_read++];
482	tex.dst_sel_z = bytes[bytes_read++];
483	tex.dst_sel_w = bytes[bytes_read++];
484	tex.lod_bias = bytes[bytes_read++];
485	tex.coord_type_x = bytes[bytes_read++];
486	tex.coord_type_y = bytes[bytes_read++];
487	tex.coord_type_z = bytes[bytes_read++];
488	tex.coord_type_w = bytes[bytes_read++];
489	tex.offset_x = bytes[bytes_read++];
490	tex.offset_y = bytes[bytes_read++];
491	tex.offset_z = bytes[bytes_read++];
492	tex.sampler_id = bytes[bytes_read++];
493	tex.src_sel_x = bytes[bytes_read++];
494	tex.src_sel_y = bytes[bytes_read++];
495	tex.src_sel_z = bytes[bytes_read++];
496	tex.src_sel_w = bytes[bytes_read++];
497
498	tex.inst_mod = 0;
499
500	r600_bytecode_add_tex(ctx->bc, &tex);
501
502	return bytes_read;
503}
504
505static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
506	unsigned char * bytes, unsigned bytes_read)
507{
508	struct r600_bytecode_vtx vtx;
509
510	uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
511        uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
512	uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);
513
514	memset(&vtx, 0, sizeof(vtx));
515
516	/* WORD0 */
517	vtx.inst = G_SQ_VTX_WORD0_VTX_INST(word0);
518	vtx.fetch_type = G_SQ_VTX_WORD0_FETCH_TYPE(word0);
519	vtx.buffer_id = G_SQ_VTX_WORD0_BUFFER_ID(word0);
520	vtx.src_gpr = G_SQ_VTX_WORD0_SRC_GPR(word0);
521	vtx.src_sel_x = G_SQ_VTX_WORD0_SRC_SEL_X(word0);
522	vtx.mega_fetch_count = G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(word0);
523
524	/* WORD1 */
525	vtx.dst_gpr = G_SQ_VTX_WORD1_GPR_DST_GPR(word1);
526	vtx.dst_sel_x = G_SQ_VTX_WORD1_DST_SEL_X(word1);
527	vtx.dst_sel_y = G_SQ_VTX_WORD1_DST_SEL_Y(word1);
528	vtx.dst_sel_z = G_SQ_VTX_WORD1_DST_SEL_Z(word1);
529	vtx.dst_sel_w = G_SQ_VTX_WORD1_DST_SEL_W(word1);
530	vtx.use_const_fields = G_SQ_VTX_WORD1_USE_CONST_FIELDS(word1);
531	vtx.data_format = G_SQ_VTX_WORD1_DATA_FORMAT(word1);
532	vtx.num_format_all = G_SQ_VTX_WORD1_NUM_FORMAT_ALL(word1);
533	vtx.format_comp_all = G_SQ_VTX_WORD1_FORMAT_COMP_ALL(word1);
534	vtx.srf_mode_all = G_SQ_VTX_WORD1_SRF_MODE_ALL(word1);
535
536	/* WORD 2*/
537	vtx.offset = G_SQ_VTX_WORD2_OFFSET(word2);
538	vtx.endian = G_SQ_VTX_WORD2_ENDIAN_SWAP(word2);
539
540	if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
541		fprintf(stderr, "Error adding vtx\n");
542	}
543
544	/* Use the Texture Cache for compute shaders*/
545	if (ctx->bc->chip_class >= EVERGREEN &&
546		ctx->bc->type == TGSI_PROCESSOR_COMPUTE) {
547		ctx->bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX;
548	}
549	return bytes_read;
550}
551
552static int r600_export_from_byte_stream(struct r600_shader_ctx *ctx,
553	unsigned char * bytes, unsigned bytes_read)
554{
555	uint32_t word0 = 0, word1 = 0;
556	struct r600_bytecode_output output;
557	memset(&output, 0, sizeof(struct r600_bytecode_output));
558	word0 = i32_from_byte_stream(bytes, &bytes_read);
559	word1 = i32_from_byte_stream(bytes, &bytes_read);
560	if (ctx->bc->chip_class >= EVERGREEN)
561		eg_bytecode_export_read(&output, word0,word1);
562	else
563		r600_bytecode_export_read(&output, word0,word1);
564	r600_bytecode_add_output(ctx->bc, &output);
565	return bytes_read;
566}
567
568static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
569				unsigned char * bytes,	unsigned num_bytes)
570{
571	unsigned bytes_read = 0;
572	unsigned i, byte;
573	while (bytes_read < num_bytes) {
574		char inst_type = bytes[bytes_read++];
575		switch (inst_type) {
576		case 0:
577			bytes_read = r600_alu_from_byte_stream(ctx, bytes,
578								bytes_read);
579			break;
580		case 1:
581			bytes_read = r600_tex_from_byte_stream(ctx, bytes,
582								bytes_read);
583			break;
584		case 2:
585			bytes_read = r600_fc_from_byte_stream(ctx, bytes,
586								bytes_read);
587			break;
588		case 3:
589			r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
590			for (i = 0; i < 2; i++) {
591				for (byte = 0 ; byte < 4; byte++) {
592					ctx->bc->cf_last->isa[i] |=
593					(bytes[bytes_read++] << (byte * 8));
594				}
595			}
596			break;
597
598		case 4:
599			bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
600								bytes_read);
601			break;
602		case 5:
603            bytes_read = r600_export_from_byte_stream(ctx, bytes,
604                                bytes_read);
605            break;
606		default:
607			/* XXX: Error here */
608			break;
609		}
610	}
611}
612
613/* End bytestream -> r600 shader functions*/
614
615static int tgsi_is_supported(struct r600_shader_ctx *ctx)
616{
617	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
618	int j;
619
620	if (i->Instruction.NumDstRegs > 1) {
621		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
622		return -EINVAL;
623	}
624	if (i->Instruction.Predicate) {
625		R600_ERR("predicate unsupported\n");
626		return -EINVAL;
627	}
628#if 0
629	if (i->Instruction.Label) {
630		R600_ERR("label unsupported\n");
631		return -EINVAL;
632	}
633#endif
634	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
635		if (i->Src[j].Register.Dimension) {
636		   if (i->Src[j].Register.File != TGSI_FILE_CONSTANT) {
637			   R600_ERR("unsupported src %d (dimension %d)\n", j,
638				    i->Src[j].Register.Dimension);
639			   return -EINVAL;
640		   }
641		}
642	}
643	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
644		if (i->Dst[j].Register.Dimension) {
645			R600_ERR("unsupported dst (dimension)\n");
646			return -EINVAL;
647		}
648	}
649	return 0;
650}
651
/*
 * Emit the ALU instructions that interpolate fragment shader input @input
 * into its GPR.
 *
 * Eight instructions are emitted in two groups of four: INTERP_ZW for the
 * first group and INTERP_XY for the second. Only instructions 2..5 write
 * their result (to the input's GPR); the others have no destination write.
 *
 * Returns 0 on success or the error returned by r600_bytecode_add_alu().
 */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = 0;

	/* Select the barycentric (i,j) pair from the interpolation mode and
	 * centroid flag of this input; any other mode keeps index 0. */
	if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
		ij_index = 0;
		if (ctx->shader->input[input].centroid)
			ij_index++;
	} else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
		ij_index = 0;
		/* if we have perspective add one */
		if (ctx->input_perspective)  {
			ij_index++;
			/* if we have perspective centroid */
			if (ctx->input_centroid)
				ij_index++;
		}
		if (ctx->shader->input[input].centroid)
			ij_index++;
	}

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_ZW;
		else
			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_XY;

		/* only the middle four instructions write to the input GPR */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* source 0 alternates between the two channels of the (i,j) pair */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		/* source 1 is the parameter selected by the input's LDS position */
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		/* close each group of four instructions */
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
709
710static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
711{
712	int i, r;
713	struct r600_bytecode_alu alu;
714
715	for (i = 0; i < 4; i++) {
716		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
717
718		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P0;
719
720		alu.dst.sel = ctx->shader->input[input].gpr;
721		alu.dst.write = 1;
722
723		alu.dst.chan = i;
724
725		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
726		alu.src[0].chan = i;
727
728		if (i == 3)
729			alu.last = 1;
730		r = r600_bytecode_add_alu(ctx->bc, &alu);
731		if (r)
732			return r;
733	}
734	return 0;
735}
736
737/*
738 * Special export handling in shaders
739 *
740 * shader export ARRAY_BASE for EXPORT_POS:
741 * 60 is position
742 * 61 is misc vector
743 * 62, 63 are clip distance vectors
744 *
745 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
746 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
747 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
748 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
749 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
750 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
751 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
752 * exclusive from render target index)
753 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
754 *
755 *
756 * shader export ARRAY_BASE for EXPORT_PIXEL:
757 * 0-7 CB targets
758 * 61 computed Z vector
759 *
760 * The use of the values exported in the computed Z vector are controlled
761 * by DB_SHADER_CONTROL:
762 * Z_EXPORT_ENABLE - Z as a float in RED
763 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
764 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
765 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
766 * DB_SOURCE_FORMAT - export control restrictions
767 *
768 */
769
770
771/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
772static int r600_spi_sid(struct r600_shader_io * io)
773{
774	int index, name = io->name;
775
776	/* These params are handled differently, they don't need
777	 * semantic indices, so we'll use 0 for them.
778	 */
779	if (name == TGSI_SEMANTIC_POSITION ||
780		name == TGSI_SEMANTIC_PSIZE ||
781		name == TGSI_SEMANTIC_FACE)
782		index = 0;
783	else {
784		if (name == TGSI_SEMANTIC_GENERIC) {
785			/* For generic params simply use sid from tgsi */
786			index = io->sid;
787		} else {
788			/* For non-generic params - pack name and sid into 8 bits */
789			index = 0x80 | (name<<3) | (io->sid);
790		}
791
792		/* Make sure that all really used indices have nonzero value, so
793		 * we can just compare it to 0 later instead of comparing the name
794		 * with different values to detect special cases. */
795		index++;
796	}
797
798	return index;
799};
800
801/* turn input into interpolate on EG */
802static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
803{
804	int r = 0;
805
806	if (ctx->shader->input[index].spi_sid) {
807		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
808		if (!ctx->use_llvm) {
809			if (ctx->shader->input[index].interpolate > 0) {
810				r = evergreen_interp_alu(ctx, index);
811			} else {
812				r = evergreen_interp_flat(ctx, index);
813			}
814		}
815	}
816	return r;
817}
818
819static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
820{
821	struct r600_bytecode_alu alu;
822	int i, r;
823	int gpr_front = ctx->shader->input[front].gpr;
824	int gpr_back = ctx->shader->input[back].gpr;
825
826	for (i = 0; i < 4; i++) {
827		memset(&alu, 0, sizeof(alu));
828		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
829		alu.is_op3 = 1;
830		alu.dst.write = 1;
831		alu.dst.sel = gpr_front;
832		alu.src[0].sel = ctx->face_gpr;
833		alu.src[1].sel = gpr_front;
834		alu.src[2].sel = gpr_back;
835
836		alu.dst.chan = i;
837		alu.src[1].chan = i;
838		alu.src[2].chan = i;
839		alu.last = (i==3);
840
841		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
842			return r;
843	}
844
845	return 0;
846}
847
/*
 * Process the TGSI declaration currently held by the parser.
 *
 * Inputs and outputs are appended to the shader's input/output arrays;
 * constant, temporary, sampler and address declarations need no work here.
 * Of the system values only INSTANCEID and VERTEXID are accepted.
 *
 * Returns 0 on success, -EINVAL for an unsupported declaration, or the error
 * of an ALU emission.
 */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	unsigned i;
	int r;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		/* record the input in the next free slot */
		i = ctx->shader->ninput++;
		ctx->shader->input[i].name = d->Semantic.Name;
		ctx->shader->input[i].sid = d->Semantic.Index;
		ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
		ctx->shader->input[i].centroid = d->Interp.Centroid;
		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			/* remember the special fragment shader inputs */
			switch (ctx->shader->input[i].name) {
			case TGSI_SEMANTIC_FACE:
				ctx->face_gpr = ctx->shader->input[i].gpr;
				break;
			case TGSI_SEMANTIC_COLOR:
				ctx->colors_used++;
				break;
			case TGSI_SEMANTIC_POSITION:
				ctx->fragcoord_input = i;
				break;
			}
			/* on EVERGREEN and later the input is set up for interpolation here */
			if (ctx->bc->chip_class >= EVERGREEN) {
				if ((r = evergreen_interp_input(ctx, i)))
					return r;
			}
		}
		break;
	case TGSI_FILE_OUTPUT:
		/* record the output in the next free slot */
		i = ctx->shader->noutput++;
		ctx->shader->output[i].name = d->Semantic.Name;
		ctx->shader->output[i].sid = d->Semantic.Index;
		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_CLIPDIST:
				/* four mask bits per clip distance vector */
				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
				break;
			case TGSI_SEMANTIC_PSIZE:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_point_size = 1;
				break;
			case TGSI_SEMANTIC_CLIPVERTEX:
				ctx->clip_vertex_write = TRUE;
				ctx->cv_output = i;
				break;
			}
		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_COLOR:
				ctx->shader->nr_ps_max_color_exports++;
				break;
			}
		}
		break;
	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_TEMPORARY:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			/* without native integers, convert channel 3 of GPR 0
			 * (where tgsi_src() reads the instance id) to float in place */
			if (!ctx->native_integers) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		/* fall through: any other system value is unsupported */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
944
945static int r600_get_temp(struct r600_shader_ctx *ctx)
946{
947	return ctx->temp_reg + ctx->max_driver_temp_used++;
948}
949
950/*
951 * for evergreen we need to scan the shader to find the number of GPRs we need to
952 * reserve for interpolation.
953 *
954 * we need to know if we are going to emit
955 * any centroid inputs
956 * if perspective and linear are required
957*/
958static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
959{
960	int i;
961	int num_baryc;
962
963	ctx->input_linear = FALSE;
964	ctx->input_perspective = FALSE;
965	ctx->input_centroid = FALSE;
966	ctx->num_interp_gpr = 1;
967
968	/* any centroid inputs */
969	for (i = 0; i < ctx->info.num_inputs; i++) {
970		/* skip position/face */
971		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
972		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
973			continue;
974		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
975			ctx->input_linear = TRUE;
976		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
977			ctx->input_perspective = TRUE;
978		if (ctx->info.input_centroid[i])
979			ctx->input_centroid = TRUE;
980	}
981
982	num_baryc = 0;
983	/* ignoring sample for now */
984	if (ctx->input_perspective)
985		num_baryc++;
986	if (ctx->input_linear)
987		num_baryc++;
988	if (ctx->input_centroid)
989		num_baryc *= 2;
990
991	ctx->num_interp_gpr += (num_baryc + 1) >> 1;
992
993	/* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */
994	return ctx->num_interp_gpr;
995}
996
997static void tgsi_src(struct r600_shader_ctx *ctx,
998		     const struct tgsi_full_src_register *tgsi_src,
999		     struct r600_shader_src *r600_src)
1000{
1001	memset(r600_src, 0, sizeof(*r600_src));
1002	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
1003	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
1004	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
1005	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
1006	r600_src->neg = tgsi_src->Register.Negate;
1007	r600_src->abs = tgsi_src->Register.Absolute;
1008
1009	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
1010		int index;
1011		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
1012			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
1013			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
1014
1015			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
1016			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
1017			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
1018				return;
1019		}
1020		index = tgsi_src->Register.Index;
1021		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
1022		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
1023	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
1024		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
1025			r600_src->swizzle[0] = 3;
1026			r600_src->swizzle[1] = 3;
1027			r600_src->swizzle[2] = 3;
1028			r600_src->swizzle[3] = 3;
1029			r600_src->sel = 0;
1030		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
1031			r600_src->swizzle[0] = 0;
1032			r600_src->swizzle[1] = 0;
1033			r600_src->swizzle[2] = 0;
1034			r600_src->swizzle[3] = 0;
1035			r600_src->sel = 0;
1036		}
1037	} else {
1038		if (tgsi_src->Register.Indirect)
1039			r600_src->rel = V_SQ_REL_RELATIVE;
1040		r600_src->sel = tgsi_src->Register.Index;
1041		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1042	}
1043	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1044		if (tgsi_src->Register.Dimension) {
1045			r600_src->kc_bank = tgsi_src->Dimension.Index;
1046		}
1047	}
1048}
1049
1050static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int cb_idx, unsigned int offset, unsigned int dst_reg)
1051{
1052	struct r600_bytecode_vtx vtx;
1053	unsigned int ar_reg;
1054	int r;
1055
1056	if (offset) {
1057		struct r600_bytecode_alu alu;
1058
1059		memset(&alu, 0, sizeof(alu));
1060
1061		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
1062		alu.src[0].sel = ctx->bc->ar_reg;
1063
1064		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1065		alu.src[1].value = offset;
1066
1067		alu.dst.sel = dst_reg;
1068		alu.dst.write = 1;
1069		alu.last = 1;
1070
1071		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1072			return r;
1073
1074		ar_reg = dst_reg;
1075	} else {
1076		ar_reg = ctx->bc->ar_reg;
1077	}
1078
1079	memset(&vtx, 0, sizeof(vtx));
1080	vtx.buffer_id = cb_idx;
1081	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
1082	vtx.src_gpr = ar_reg;
1083	vtx.mega_fetch_count = 16;
1084	vtx.dst_gpr = dst_reg;
1085	vtx.dst_sel_x = 0;		/* SEL_X */
1086	vtx.dst_sel_y = 1;		/* SEL_Y */
1087	vtx.dst_sel_z = 2;		/* SEL_Z */
1088	vtx.dst_sel_w = 3;		/* SEL_W */
1089	vtx.data_format = FMT_32_32_32_32_FLOAT;
1090	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1091	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1092	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
1093	vtx.endian = r600_endian_swap(32);
1094
1095	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1096		return r;
1097
1098	return 0;
1099}
1100
1101static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1102{
1103	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1104	struct r600_bytecode_alu alu;
1105	int i, j, k, nconst, r;
1106
1107	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1108		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1109			nconst++;
1110		}
1111		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1112	}
1113	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1114		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1115			continue;
1116		}
1117
1118		if (ctx->src[i].rel) {
1119			int treg = r600_get_temp(ctx);
1120			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].sel - 512, treg)))
1121				return r;
1122
1123			ctx->src[i].kc_bank = 0;
1124			ctx->src[i].sel = treg;
1125			ctx->src[i].rel = 0;
1126			j--;
1127		} else if (j > 0) {
1128			int treg = r600_get_temp(ctx);
1129			for (k = 0; k < 4; k++) {
1130				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1131				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1132				alu.src[0].sel = ctx->src[i].sel;
1133				alu.src[0].chan = k;
1134				alu.src[0].rel = ctx->src[i].rel;
1135				alu.dst.sel = treg;
1136				alu.dst.chan = k;
1137				alu.dst.write = 1;
1138				if (k == 3)
1139					alu.last = 1;
1140				r = r600_bytecode_add_alu(ctx->bc, &alu);
1141				if (r)
1142					return r;
1143			}
1144			ctx->src[i].sel = treg;
1145			ctx->src[i].rel =0;
1146			j--;
1147		}
1148	}
1149	return 0;
1150}
1151
1152/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1153static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1154{
1155	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1156	struct r600_bytecode_alu alu;
1157	int i, j, k, nliteral, r;
1158
1159	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1160		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1161			nliteral++;
1162		}
1163	}
1164	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1165		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1166			int treg = r600_get_temp(ctx);
1167			for (k = 0; k < 4; k++) {
1168				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1169				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1170				alu.src[0].sel = ctx->src[i].sel;
1171				alu.src[0].chan = k;
1172				alu.src[0].value = ctx->src[i].value[k];
1173				alu.dst.sel = treg;
1174				alu.dst.chan = k;
1175				alu.dst.write = 1;
1176				if (k == 3)
1177					alu.last = 1;
1178				r = r600_bytecode_add_alu(ctx->bc, &alu);
1179				if (r)
1180					return r;
1181			}
1182			ctx->src[i].sel = treg;
1183			j--;
1184		}
1185	}
1186	return 0;
1187}
1188
1189static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1190{
1191	int i, r, count = ctx->shader->ninput;
1192
1193	for (i = 0; i < count; i++) {
1194		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1195			unsigned back_facing_reg = ctx->shader->input[i].potential_back_facing_reg;
1196			if (ctx->bc->chip_class >= EVERGREEN) {
1197				if ((r = evergreen_interp_input(ctx, back_facing_reg)))
1198					return r;
1199			}
1200
1201			if (!ctx->use_llvm) {
1202				r = select_twoside_color(ctx, i, back_facing_reg);
1203				if (r)
1204					return r;
1205			}
1206		}
1207	}
1208	return 0;
1209}
1210
1211static int r600_shader_from_tgsi(struct r600_screen *rscreen,
1212				 struct r600_pipe_shader *pipeshader,
1213				 struct r600_shader_key key)
1214{
1215	struct r600_shader *shader = &pipeshader->shader;
1216	struct tgsi_token *tokens = pipeshader->selector->tokens;
1217	struct pipe_stream_output_info so = pipeshader->selector->so;
1218	struct tgsi_full_immediate *immediate;
1219	struct tgsi_full_property *property;
1220	struct r600_shader_ctx ctx;
1221	struct r600_bytecode_output output[32];
1222	unsigned output_done, noutput;
1223	unsigned opcode;
1224	int i, j, k, r = 0;
1225	int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0;
1226	/* Declarations used by llvm code */
1227	bool use_llvm = false;
1228	unsigned char * inst_bytes = NULL;
1229	unsigned inst_byte_count = 0;
1230
1231#ifdef R600_USE_LLVM
1232	use_llvm = debug_get_bool_option("R600_LLVM", TRUE);
1233#endif
1234	ctx.bc = &shader->bc;
1235	ctx.shader = shader;
1236	ctx.native_integers = true;
1237
1238	r600_bytecode_init(ctx.bc, rscreen->chip_class, rscreen->family,
1239			   rscreen->msaa_texture_support);
1240	ctx.tokens = tokens;
1241	tgsi_scan_shader(tokens, &ctx.info);
1242	tgsi_parse_init(&ctx.parse, tokens);
1243	ctx.type = ctx.parse.FullHeader.Processor.Processor;
1244	shader->processor_type = ctx.type;
1245	ctx.bc->type = shader->processor_type;
1246
1247	ctx.face_gpr = -1;
1248	ctx.fragcoord_input = -1;
1249	ctx.colors_used = 0;
1250	ctx.clip_vertex_write = 0;
1251
1252	shader->nr_ps_color_exports = 0;
1253	shader->nr_ps_max_color_exports = 0;
1254
1255	shader->two_side = key.color_two_side;
1256
1257	/* register allocations */
1258	/* Values [0,127] correspond to GPR[0..127].
1259	 * Values [128,159] correspond to constant buffer bank 0
1260	 * Values [160,191] correspond to constant buffer bank 1
1261	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1262	 * Values [256,287] correspond to constant buffer bank 2 (EG)
1263	 * Values [288,319] correspond to constant buffer bank 3 (EG)
1264	 * Other special values are shown in the list below.
1265	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1266	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1267	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1268	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1269	 * 248	SQ_ALU_SRC_0: special constant 0.0.
1270	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
1271	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
1272	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1273	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
1274	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
1275	 * 254	SQ_ALU_SRC_PV: previous vector result.
1276	 * 255	SQ_ALU_SRC_PS: previous scalar result.
1277	 */
1278	for (i = 0; i < TGSI_FILE_COUNT; i++) {
1279		ctx.file_offset[i] = 0;
1280	}
1281	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1282		ctx.file_offset[TGSI_FILE_INPUT] = 1;
1283		if (ctx.bc->chip_class >= EVERGREEN) {
1284			r600_bytecode_add_cfinst(ctx.bc, EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1285		} else {
1286			r600_bytecode_add_cfinst(ctx.bc, V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1287		}
1288	}
1289	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
1290		ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1291	}
1292
1293#ifdef R600_USE_LLVM
1294	if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
1295		fprintf(stderr, "Warning: R600 LLVM backend does not support "
1296				"indirect adressing.  Falling back to TGSI "
1297				"backend.\n");
1298		use_llvm = 0;
1299	}
1300#endif
1301	ctx.use_llvm = use_llvm;
1302
1303	if (use_llvm) {
1304		ctx.file_offset[TGSI_FILE_OUTPUT] =
1305			ctx.file_offset[TGSI_FILE_INPUT];
1306	} else {
1307	   ctx.file_offset[TGSI_FILE_OUTPUT] =
1308			ctx.file_offset[TGSI_FILE_INPUT] +
1309			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1310	}
1311	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
1312						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
1313
1314	/* Outside the GPR range. This will be translated to one of the
1315	 * kcache banks later. */
1316	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
1317
1318	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
1319	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
1320			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
1321	ctx.temp_reg = ctx.bc->ar_reg + 1;
1322
1323	ctx.nliterals = 0;
1324	ctx.literals = NULL;
1325	shader->fs_write_all = FALSE;
1326	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1327		tgsi_parse_token(&ctx.parse);
1328		switch (ctx.parse.FullToken.Token.Type) {
1329		case TGSI_TOKEN_TYPE_IMMEDIATE:
1330			immediate = &ctx.parse.FullToken.FullImmediate;
1331			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
1332			if(ctx.literals == NULL) {
1333				r = -ENOMEM;
1334				goto out_err;
1335			}
1336			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
1337			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
1338			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
1339			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
1340			ctx.nliterals++;
1341			break;
1342		case TGSI_TOKEN_TYPE_DECLARATION:
1343			r = tgsi_declaration(&ctx);
1344			if (r)
1345				goto out_err;
1346			break;
1347		case TGSI_TOKEN_TYPE_INSTRUCTION:
1348			break;
1349		case TGSI_TOKEN_TYPE_PROPERTY:
1350			property = &ctx.parse.FullToken.FullProperty;
1351			switch (property->Property.PropertyName) {
1352			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
1353				if (property->u[0].Data == 1)
1354					shader->fs_write_all = TRUE;
1355				break;
1356			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
1357				/* we don't need this one */
1358				break;
1359			}
1360			break;
1361		default:
1362			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
1363			r = -EINVAL;
1364			goto out_err;
1365		}
1366	}
1367
1368	/* Process two side if needed */
1369	if (shader->two_side && ctx.colors_used) {
1370		int i, count = ctx.shader->ninput;
1371		unsigned next_lds_loc = ctx.shader->nlds;
1372
1373		/* additional inputs will be allocated right after the existing inputs,
1374		 * we won't need them after the color selection, so we don't need to
1375		 * reserve these gprs for the rest of the shader code and to adjust
1376		 * output offsets etc. */
1377		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
1378				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1379
1380		if (ctx.face_gpr == -1) {
1381			i = ctx.shader->ninput++;
1382			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
1383			ctx.shader->input[i].spi_sid = 0;
1384			ctx.shader->input[i].gpr = gpr++;
1385			ctx.face_gpr = ctx.shader->input[i].gpr;
1386		}
1387
1388		for (i = 0; i < count; i++) {
1389			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1390				int ni = ctx.shader->ninput++;
1391				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
1392				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
1393				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
1394				ctx.shader->input[ni].gpr = gpr++;
1395				// TGSI to LLVM needs to know the lds position of inputs.
1396				// Non LLVM path computes it later (in process_twoside_color)
1397				ctx.shader->input[ni].lds_pos = next_lds_loc++;
1398				ctx.shader->input[i].potential_back_facing_reg = ni;
1399			}
1400		}
1401	}
1402
1403/* LLVM backend setup */
1404#ifdef R600_USE_LLVM
1405	if (use_llvm) {
1406		struct radeon_llvm_context radeon_llvm_ctx;
1407		LLVMModuleRef mod;
1408		unsigned dump = 0;
1409		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
1410		radeon_llvm_ctx.reserved_reg_count = ctx.file_offset[TGSI_FILE_INPUT];
1411		radeon_llvm_ctx.type = ctx.type;
1412		radeon_llvm_ctx.two_side = shader->two_side;
1413		radeon_llvm_ctx.face_input = ctx.face_gpr;
1414		radeon_llvm_ctx.r600_inputs = ctx.shader->input;
1415		radeon_llvm_ctx.r600_outputs = ctx.shader->output;
1416		radeon_llvm_ctx.color_buffer_count = MAX2(key.nr_cbufs , 1);
1417		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
1418		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->chip_class >= EVERGREEN);
1419		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
1420		if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
1421			dump = 1;
1422		}
1423		if (r600_llvm_compile(mod, &inst_bytes, &inst_byte_count,
1424							rscreen->family, dump)) {
1425			FREE(inst_bytes);
1426			radeon_llvm_dispose(&radeon_llvm_ctx);
1427			use_llvm = 0;
1428			fprintf(stderr, "R600 LLVM backend failed to compile "
1429				"shader.  Falling back to TGSI\n");
1430		} else {
1431			ctx.file_offset[TGSI_FILE_OUTPUT] =
1432					ctx.file_offset[TGSI_FILE_INPUT];
1433		}
1434		radeon_llvm_dispose(&radeon_llvm_ctx);
1435	}
1436#endif
1437/* End of LLVM backend setup */
1438
1439	if (shader->fs_write_all && rscreen->chip_class >= EVERGREEN)
1440		shader->nr_ps_max_color_exports = 8;
1441
1442	if (ctx.fragcoord_input >= 0 && !use_llvm) {
1443		if (ctx.bc->chip_class == CAYMAN) {
1444			for (j = 0 ; j < 4; j++) {
1445				struct r600_bytecode_alu alu;
1446				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1447				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1448				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1449				alu.src[0].chan = 3;
1450
1451				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1452				alu.dst.chan = j;
1453				alu.dst.write = (j == 3);
1454				alu.last = 1;
1455				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1456					return r;
1457			}
1458		} else {
1459			struct r600_bytecode_alu alu;
1460			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1461			alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1462			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1463			alu.src[0].chan = 3;
1464
1465			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1466			alu.dst.chan = 3;
1467			alu.dst.write = 1;
1468			alu.last = 1;
1469			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1470				return r;
1471		}
1472	}
1473
1474	if (shader->two_side && ctx.colors_used) {
1475		if ((r = process_twoside_color_inputs(&ctx)))
1476			return r;
1477	}
1478
1479	tgsi_parse_init(&ctx.parse, tokens);
1480	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1481		tgsi_parse_token(&ctx.parse);
1482		switch (ctx.parse.FullToken.Token.Type) {
1483		case TGSI_TOKEN_TYPE_INSTRUCTION:
1484			if (use_llvm) {
1485				continue;
1486			}
1487			r = tgsi_is_supported(&ctx);
1488			if (r)
1489				goto out_err;
1490			ctx.max_driver_temp_used = 0;
1491			/* reserve first tmp for everyone */
1492			r600_get_temp(&ctx);
1493
1494			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
1495			if ((r = tgsi_split_constant(&ctx)))
1496				goto out_err;
1497			if ((r = tgsi_split_literal_constant(&ctx)))
1498				goto out_err;
1499			if (ctx.bc->chip_class == CAYMAN)
1500				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
1501			else if (ctx.bc->chip_class >= EVERGREEN)
1502				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
1503			else
1504				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
1505			r = ctx.inst_info->process(&ctx);
1506			if (r)
1507				goto out_err;
1508			break;
1509		default:
1510			break;
1511		}
1512	}
1513
1514	/* Reset the temporary register counter. */
1515	ctx.max_driver_temp_used = 0;
1516
1517	/* Get instructions if we are using the LLVM backend. */
1518	if (use_llvm) {
1519		r600_bytecode_from_byte_stream(&ctx, inst_bytes, inst_byte_count);
1520		FREE(inst_bytes);
1521	}
1522
1523	noutput = shader->noutput;
1524
1525	if (ctx.clip_vertex_write) {
1526		unsigned clipdist_temp[2];
1527
1528		clipdist_temp[0] = r600_get_temp(&ctx);
1529		clipdist_temp[1] = r600_get_temp(&ctx);
1530
1531		/* need to convert a clipvertex write into clipdistance writes and not export
1532		   the clip vertex anymore */
1533
1534		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
1535		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1536		shader->output[noutput].gpr = clipdist_temp[0];
1537		noutput++;
1538		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1539		shader->output[noutput].gpr = clipdist_temp[1];
1540		noutput++;
1541
1542		/* reset spi_sid for clipvertex output to avoid confusing spi */
1543		shader->output[ctx.cv_output].spi_sid = 0;
1544
1545		shader->clip_dist_write = 0xFF;
1546
1547		for (i = 0; i < 8; i++) {
1548			int oreg = i >> 2;
1549			int ochan = i & 3;
1550
1551			for (j = 0; j < 4; j++) {
1552				struct r600_bytecode_alu alu;
1553				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1554				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4);
1555				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
1556				alu.src[0].chan = j;
1557
1558				alu.src[1].sel = 512 + i;
1559				alu.src[1].kc_bank = R600_UCP_CONST_BUFFER;
1560				alu.src[1].chan = j;
1561
1562				alu.dst.sel = clipdist_temp[oreg];
1563				alu.dst.chan = j;
1564				alu.dst.write = (j == ochan);
1565				if (j == 3)
1566					alu.last = 1;
1567				r = r600_bytecode_add_alu(ctx.bc, &alu);
1568				if (r)
1569					return r;
1570			}
1571		}
1572	}
1573
1574	/* Add stream outputs. */
1575	if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
1576		unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
1577
1578		/* Sanity checking. */
1579		if (so.num_outputs > PIPE_MAX_SHADER_OUTPUTS) {
1580			R600_ERR("Too many stream outputs: %d\n", so.num_outputs);
1581			r = -EINVAL;
1582			goto out_err;
1583		}
1584		for (i = 0; i < so.num_outputs; i++) {
1585			if (so.output[i].output_buffer >= 4) {
1586				R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
1587					 so.output[i].output_buffer);
1588				r = -EINVAL;
1589				goto out_err;
1590			}
1591		}
1592
1593		/* Initialize locations where the outputs are stored. */
1594		for (i = 0; i < so.num_outputs; i++) {
1595			so_gpr[i] = shader->output[so.output[i].register_index].gpr;
1596
1597			/* Lower outputs with dst_offset < start_component.
1598			 *
1599			 * We can only output 4D vectors with a write mask, e.g. we can
1600			 * only output the W component at offset 3, etc. If we want
1601			 * to store Y, Z, or W at buffer offset 0, we need to use MOV
1602			 * to move it to X and output X. */
1603			if (so.output[i].dst_offset < so.output[i].start_component) {
1604				unsigned tmp = r600_get_temp(&ctx);
1605
1606				for (j = 0; j < so.output[i].num_components; j++) {
1607					struct r600_bytecode_alu alu;
1608					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1609					alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1610					alu.src[0].sel = so_gpr[i];
1611					alu.src[0].chan = so.output[i].start_component + j;
1612
1613					alu.dst.sel = tmp;
1614					alu.dst.chan = j;
1615					alu.dst.write = 1;
1616					if (j == so.output[i].num_components - 1)
1617						alu.last = 1;
1618					r = r600_bytecode_add_alu(ctx.bc, &alu);
1619					if (r)
1620						return r;
1621				}
1622				so.output[i].start_component = 0;
1623				so_gpr[i] = tmp;
1624			}
1625		}
1626
1627		/* Write outputs to buffers. */
1628		for (i = 0; i < so.num_outputs; i++) {
1629			struct r600_bytecode_output output;
1630
1631			memset(&output, 0, sizeof(struct r600_bytecode_output));
1632			output.gpr = so_gpr[i];
1633			output.elem_size = so.output[i].num_components;
1634			output.array_base = so.output[i].dst_offset - so.output[i].start_component;
1635			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1636			output.burst_count = 1;
1637			output.barrier = 1;
1638			/* array_size is an upper limit for the burst_count
1639			 * with MEM_STREAM instructions */
1640			output.array_size = 0xFFF;
1641			output.comp_mask = ((1 << so.output[i].num_components) - 1) << so.output[i].start_component;
1642			if (ctx.bc->chip_class >= EVERGREEN) {
1643				switch (so.output[i].output_buffer) {
1644				case 0:
1645					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0;
1646					break;
1647				case 1:
1648					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1;
1649					break;
1650				case 2:
1651					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2;
1652					break;
1653				case 3:
1654					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3;
1655					break;
1656				}
1657			} else {
1658				switch (so.output[i].output_buffer) {
1659				case 0:
1660					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0;
1661					break;
1662				case 1:
1663					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1;
1664					break;
1665				case 2:
1666					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2;
1667					break;
1668				case 3:
1669					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3;
1670					break;
1671				}
1672			}
1673			r = r600_bytecode_add_output(ctx.bc, &output);
1674			if (r)
1675				goto out_err;
1676		}
1677	}
1678
1679	/* export output */
1680	for (i = 0, j = 0; i < noutput; i++, j++) {
1681		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1682		output[j].gpr = shader->output[i].gpr;
1683		output[j].elem_size = 3;
1684		output[j].swizzle_x = 0;
1685		output[j].swizzle_y = 1;
1686		output[j].swizzle_z = 2;
1687		output[j].swizzle_w = 3;
1688		output[j].burst_count = 1;
1689		output[j].barrier = 1;
1690		output[j].type = -1;
1691		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1692		switch (ctx.type) {
1693		case TGSI_PROCESSOR_VERTEX:
1694			switch (shader->output[i].name) {
1695			case TGSI_SEMANTIC_POSITION:
1696				output[j].array_base = next_pos_base++;
1697				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1698				break;
1699
1700			case TGSI_SEMANTIC_PSIZE:
1701				output[j].array_base = next_pos_base++;
1702				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1703				break;
1704			case TGSI_SEMANTIC_CLIPVERTEX:
1705				j--;
1706				break;
1707			case TGSI_SEMANTIC_CLIPDIST:
1708				output[j].array_base = next_pos_base++;
1709				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1710				/* spi_sid is 0 for clipdistance outputs that were generated
1711				 * for clipvertex - we don't need to pass them to PS */
1712				if (shader->output[i].spi_sid) {
1713					j++;
1714					/* duplicate it as PARAM to pass to the pixel shader */
1715					memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
1716					output[j].array_base = next_param_base++;
1717					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1718				}
1719				break;
1720			case TGSI_SEMANTIC_FOG:
1721				output[j].swizzle_y = 4; /* 0 */
1722				output[j].swizzle_z = 4; /* 0 */
1723				output[j].swizzle_w = 5; /* 1 */
1724				break;
1725			}
1726			break;
1727		case TGSI_PROCESSOR_FRAGMENT:
1728			if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
1729				/* never export more colors than the number of CBs */
1730				if (next_pixel_base && next_pixel_base >= key.nr_cbufs) {
1731					/* skip export */
1732					j--;
1733					continue;
1734				}
1735				output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
1736				output[j].array_base = next_pixel_base++;
1737				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1738				shader->nr_ps_color_exports++;
1739				if (shader->fs_write_all && (rscreen->chip_class >= EVERGREEN)) {
1740					for (k = 1; k < key.nr_cbufs; k++) {
1741						j++;
1742						memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1743						output[j].gpr = shader->output[i].gpr;
1744						output[j].elem_size = 3;
1745						output[j].swizzle_x = 0;
1746						output[j].swizzle_y = 1;
1747						output[j].swizzle_z = 2;
1748						output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
1749						output[j].burst_count = 1;
1750						output[j].barrier = 1;
1751						output[j].array_base = next_pixel_base++;
1752						output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1753						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1754						shader->nr_ps_color_exports++;
1755					}
1756				}
1757			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
1758				output[j].array_base = 61;
1759				output[j].swizzle_x = 2;
1760				output[j].swizzle_y = 7;
1761				output[j].swizzle_z = output[j].swizzle_w = 7;
1762				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1763			} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
1764				output[j].array_base = 61;
1765				output[j].swizzle_x = 7;
1766				output[j].swizzle_y = 1;
1767				output[j].swizzle_z = output[j].swizzle_w = 7;
1768				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1769			} else {
1770				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
1771				r = -EINVAL;
1772				goto out_err;
1773			}
1774			break;
1775		default:
1776			R600_ERR("unsupported processor type %d\n", ctx.type);
1777			r = -EINVAL;
1778			goto out_err;
1779		}
1780
1781		if (output[j].type==-1) {
1782			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1783			output[j].array_base = next_param_base++;
1784		}
1785	}
1786
1787        /* add fake position export */
1788	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_pos_base == 60) {
1789			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1790			output[j].gpr = 0;
1791			output[j].elem_size = 3;
1792			output[j].swizzle_x = 7;
1793			output[j].swizzle_y = 7;
1794			output[j].swizzle_z = 7;
1795			output[j].swizzle_w = 7;
1796			output[j].burst_count = 1;
1797			output[j].barrier = 1;
1798			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1799			output[j].array_base = next_pos_base;
1800			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1801			j++;
1802	}
1803
1804	/* add fake param output for vertex shader if no param is exported */
1805	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
1806			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1807			output[j].gpr = 0;
1808			output[j].elem_size = 3;
1809			output[j].swizzle_x = 7;
1810			output[j].swizzle_y = 7;
1811			output[j].swizzle_z = 7;
1812			output[j].swizzle_w = 7;
1813			output[j].burst_count = 1;
1814			output[j].barrier = 1;
1815			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1816			output[j].array_base = 0;
1817			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1818			j++;
1819	}
1820
1821	/* add fake pixel export */
1822	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && next_pixel_base == 0) {
1823		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1824		output[j].gpr = 0;
1825		output[j].elem_size = 3;
1826		output[j].swizzle_x = 7;
1827		output[j].swizzle_y = 7;
1828		output[j].swizzle_z = 7;
1829		output[j].swizzle_w = 7;
1830		output[j].burst_count = 1;
1831		output[j].barrier = 1;
1832		output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1833		output[j].array_base = 0;
1834		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1835		j++;
1836	}
1837
1838	noutput = j;
1839
1840	/* set export done on last export of each type */
1841	for (i = noutput - 1, output_done = 0; i >= 0; i--) {
1842		if (ctx.bc->chip_class < CAYMAN) {
1843			if (i == (noutput - 1)) {
1844				output[i].end_of_program = 1;
1845			}
1846		}
1847		if (!(output_done & (1 << output[i].type))) {
1848			output_done |= (1 << output[i].type);
1849			output[i].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE);
1850		}
1851	}
1852	/* add output to bytecode */
1853	if (!use_llvm || ctx.type != TGSI_PROCESSOR_FRAGMENT) {
1854		for (i = 0; i < noutput; i++) {
1855			r = r600_bytecode_add_output(ctx.bc, &output[i]);
1856			if (r)
1857				goto out_err;
1858		}
1859	}
1860	/* add program end */
1861	if (ctx.bc->chip_class == CAYMAN)
1862		cm_bytecode_add_cf_end(ctx.bc);
1863
1864	/* check GPR limit - we have 124 = 128 - 4
1865	 * (4 are reserved as alu clause temporary registers) */
1866	if (ctx.bc->ngpr > 124) {
1867		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
1868		r = -ENOMEM;
1869		goto out_err;
1870	}
1871
1872	free(ctx.literals);
1873	tgsi_parse_free(&ctx.parse);
1874	return 0;
1875out_err:
1876	free(ctx.literals);
1877	tgsi_parse_free(&ctx.parse);
1878	return r;
1879}
1880
1881static int tgsi_unsupported(struct r600_shader_ctx *ctx)
1882{
1883	R600_ERR("%s tgsi opcode unsupported\n",
1884		 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
1885	return -EINVAL;
1886}
1887
/* Handler for the TGSI END opcode: emits nothing and always succeeds. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	(void)ctx;	/* context is not needed */

	return 0;
}
1892
1893static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
1894			const struct r600_shader_src *shader_src,
1895			unsigned chan)
1896{
1897	bc_src->sel = shader_src->sel;
1898	bc_src->chan = shader_src->swizzle[chan];
1899	bc_src->neg = shader_src->neg;
1900	bc_src->abs = shader_src->abs;
1901	bc_src->rel = shader_src->rel;
1902	bc_src->value = shader_src->value[bc_src->chan];
1903	bc_src->kc_bank = shader_src->kc_bank;
1904}
1905
1906static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
1907{
1908	bc_src->abs = 1;
1909	bc_src->neg = 0;
1910}
1911
1912static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
1913{
1914	bc_src->neg = !bc_src->neg;
1915}
1916
1917static void tgsi_dst(struct r600_shader_ctx *ctx,
1918		     const struct tgsi_full_dst_register *tgsi_dst,
1919		     unsigned swizzle,
1920		     struct r600_bytecode_alu_dst *r600_dst)
1921{
1922	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1923
1924	r600_dst->sel = tgsi_dst->Register.Index;
1925	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
1926	r600_dst->chan = swizzle;
1927	r600_dst->write = 1;
1928	if (tgsi_dst->Register.Indirect)
1929		r600_dst->rel = V_SQ_REL_RELATIVE;
1930	if (inst->Instruction.Saturate) {
1931		r600_dst->clamp = 1;
1932	}
1933}
1934
/*
 * Return the index (0..3) of the highest channel enabled in writemask.
 * Only bits 0..3 are examined; an empty mask yields 0.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* Walk from W down to Y; the first hit is the highest enabled channel. */
	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1 << chan))
			return chan;
	}
	/* Either only X is enabled or nothing is: both map to channel 0. */
	return 0;
}
1946
1947static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
1948{
1949	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1950	struct r600_bytecode_alu alu;
1951	int i, j, r;
1952	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1953
1954	for (i = 0; i < lasti + 1; i++) {
1955		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1956			continue;
1957
1958		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1959		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1960
1961		alu.inst = ctx->inst_info->r600_opcode;
1962		if (!swap) {
1963			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1964				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
1965			}
1966		} else {
1967			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
1968			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1969		}
1970		/* handle some special cases */
1971		switch (ctx->inst_info->tgsi_opcode) {
1972		case TGSI_OPCODE_SUB:
1973			r600_bytecode_src_toggle_neg(&alu.src[1]);
1974			break;
1975		case TGSI_OPCODE_ABS:
1976			r600_bytecode_src_set_abs(&alu.src[0]);
1977			break;
1978		default:
1979			break;
1980		}
1981		if (i == lasti || trans_only) {
1982			alu.last = 1;
1983		}
1984		r = r600_bytecode_add_alu(ctx->bc, &alu);
1985		if (r)
1986			return r;
1987	}
1988	return 0;
1989}
1990
/* Per-channel ALU op: sources in TGSI order, only the final op is "last". */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, /* swap */ 0, /* trans_only */ 0);
}
1995
/* Per-channel ALU op with the first two sources exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, /* swap */ 1, /* trans_only */ 0);
}
2000
/* Per-channel ALU op where every emitted op is flagged "last". */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, /* swap */ 0, /* trans_only */ 1);
}
2005
2006static int tgsi_ineg(struct r600_shader_ctx *ctx)
2007{
2008	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2009	struct r600_bytecode_alu alu;
2010	int i, r;
2011	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2012
2013	for (i = 0; i < lasti + 1; i++) {
2014
2015		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2016			continue;
2017		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2018		alu.inst = ctx->inst_info->r600_opcode;
2019
2020		alu.src[0].sel = V_SQ_ALU_SRC_0;
2021
2022		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2023
2024		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2025
2026		if (i == lasti) {
2027			alu.last = 1;
2028		}
2029		r = r600_bytecode_add_alu(ctx->bc, &alu);
2030		if (r)
2031			return r;
2032	}
2033	return 0;
2034
2035}
2036
2037static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
2038{
2039	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2040	int i, j, r;
2041	struct r600_bytecode_alu alu;
2042	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2043
2044	for (i = 0 ; i < last_slot; i++) {
2045		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2046		alu.inst = ctx->inst_info->r600_opcode;
2047		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2048			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
2049
2050			/* RSQ should take the absolute value of src */
2051			if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_RSQ) {
2052				r600_bytecode_src_set_abs(&alu.src[j]);
2053			}
2054		}
2055		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2056		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2057
2058		if (i == last_slot - 1)
2059			alu.last = 1;
2060		r = r600_bytecode_add_alu(ctx->bc, &alu);
2061		if (r)
2062			return r;
2063	}
2064	return 0;
2065}
2066
2067static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
2068{
2069	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2070	int i, j, k, r;
2071	struct r600_bytecode_alu alu;
2072	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2073	for (k = 0; k < last_slot; k++) {
2074		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
2075			continue;
2076
2077		for (i = 0 ; i < 4; i++) {
2078			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2079			alu.inst = ctx->inst_info->r600_opcode;
2080			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2081				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
2082			}
2083			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2084			alu.dst.write = (i == k);
2085			if (i == 3)
2086				alu.last = 1;
2087			r = r600_bytecode_add_alu(ctx->bc, &alu);
2088			if (r)
2089				return r;
2090		}
2091	}
2092	return 0;
2093}
2094
2095/*
2096 * r600 - trunc to -PI..PI range
2097 * r700 - normalize by dividing by 2PI
2098 * see fdo bug 27901
2099 */
2100static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
2101{
2102	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
2103	static float double_pi = 3.1415926535 * 2;
2104	static float neg_pi = -3.1415926535;
2105
2106	int r;
2107	struct r600_bytecode_alu alu;
2108
2109	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2110	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
2111	alu.is_op3 = 1;
2112
2113	alu.dst.chan = 0;
2114	alu.dst.sel = ctx->temp_reg;
2115	alu.dst.write = 1;
2116
2117	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2118
2119	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2120	alu.src[1].chan = 0;
2121	alu.src[1].value = *(uint32_t *)&half_inv_pi;
2122	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2123	alu.src[2].chan = 0;
2124	alu.last = 1;
2125	r = r600_bytecode_add_alu(ctx->bc, &alu);
2126	if (r)
2127		return r;
2128
2129	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2130	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
2131
2132	alu.dst.chan = 0;
2133	alu.dst.sel = ctx->temp_reg;
2134	alu.dst.write = 1;
2135
2136	alu.src[0].sel = ctx->temp_reg;
2137	alu.src[0].chan = 0;
2138	alu.last = 1;
2139	r = r600_bytecode_add_alu(ctx->bc, &alu);
2140	if (r)
2141		return r;
2142
2143	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2144	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
2145	alu.is_op3 = 1;
2146
2147	alu.dst.chan = 0;
2148	alu.dst.sel = ctx->temp_reg;
2149	alu.dst.write = 1;
2150
2151	alu.src[0].sel = ctx->temp_reg;
2152	alu.src[0].chan = 0;
2153
2154	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2155	alu.src[1].chan = 0;
2156	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
2157	alu.src[2].chan = 0;
2158
2159	if (ctx->bc->chip_class == R600) {
2160		alu.src[1].value = *(uint32_t *)&double_pi;
2161		alu.src[2].value = *(uint32_t *)&neg_pi;
2162	} else {
2163		alu.src[1].sel = V_SQ_ALU_SRC_1;
2164		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2165		alu.src[2].neg = 1;
2166	}
2167
2168	alu.last = 1;
2169	r = r600_bytecode_add_alu(ctx->bc, &alu);
2170	if (r)
2171		return r;
2172	return 0;
2173}
2174
2175static int cayman_trig(struct r600_shader_ctx *ctx)
2176{
2177	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2178	struct r600_bytecode_alu alu;
2179	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2180	int i, r;
2181
2182	r = tgsi_setup_trig(ctx);
2183	if (r)
2184		return r;
2185
2186
2187	for (i = 0; i < last_slot; i++) {
2188		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2189		alu.inst = ctx->inst_info->r600_opcode;
2190		alu.dst.chan = i;
2191
2192		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2193		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2194
2195		alu.src[0].sel = ctx->temp_reg;
2196		alu.src[0].chan = 0;
2197		if (i == last_slot - 1)
2198			alu.last = 1;
2199		r = r600_bytecode_add_alu(ctx->bc, &alu);
2200		if (r)
2201			return r;
2202	}
2203	return 0;
2204}
2205
2206static int tgsi_trig(struct r600_shader_ctx *ctx)
2207{
2208	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2209	struct r600_bytecode_alu alu;
2210	int i, r;
2211	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2212
2213	r = tgsi_setup_trig(ctx);
2214	if (r)
2215		return r;
2216
2217	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2218	alu.inst = ctx->inst_info->r600_opcode;
2219	alu.dst.chan = 0;
2220	alu.dst.sel = ctx->temp_reg;
2221	alu.dst.write = 1;
2222
2223	alu.src[0].sel = ctx->temp_reg;
2224	alu.src[0].chan = 0;
2225	alu.last = 1;
2226	r = r600_bytecode_add_alu(ctx->bc, &alu);
2227	if (r)
2228		return r;
2229
2230	/* replicate result */
2231	for (i = 0; i < lasti + 1; i++) {
2232		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2233			continue;
2234
2235		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2236		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2237
2238		alu.src[0].sel = ctx->temp_reg;
2239		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2240		if (i == lasti)
2241			alu.last = 1;
2242		r = r600_bytecode_add_alu(ctx->bc, &alu);
2243		if (r)
2244			return r;
2245	}
2246	return 0;
2247}
2248
2249static int tgsi_scs(struct r600_shader_ctx *ctx)
2250{
2251	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2252	struct r600_bytecode_alu alu;
2253	int i, r;
2254
2255	/* We'll only need the trig stuff if we are going to write to the
2256	 * X or Y components of the destination vector.
2257	 */
2258	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
2259		r = tgsi_setup_trig(ctx);
2260		if (r)
2261			return r;
2262	}
2263
2264	/* dst.x = COS */
2265	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2266		if (ctx->bc->chip_class == CAYMAN) {
2267			for (i = 0 ; i < 3; i++) {
2268				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2269				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2270				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2271
2272				if (i == 0)
2273					alu.dst.write = 1;
2274				else
2275					alu.dst.write = 0;
2276				alu.src[0].sel = ctx->temp_reg;
2277				alu.src[0].chan = 0;
2278				if (i == 2)
2279					alu.last = 1;
2280				r = r600_bytecode_add_alu(ctx->bc, &alu);
2281				if (r)
2282					return r;
2283			}
2284		} else {
2285			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2286			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2287			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2288
2289			alu.src[0].sel = ctx->temp_reg;
2290			alu.src[0].chan = 0;
2291			alu.last = 1;
2292			r = r600_bytecode_add_alu(ctx->bc, &alu);
2293			if (r)
2294				return r;
2295		}
2296	}
2297
2298	/* dst.y = SIN */
2299	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2300		if (ctx->bc->chip_class == CAYMAN) {
2301			for (i = 0 ; i < 3; i++) {
2302				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2303				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2304				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2305				if (i == 1)
2306					alu.dst.write = 1;
2307				else
2308					alu.dst.write = 0;
2309				alu.src[0].sel = ctx->temp_reg;
2310				alu.src[0].chan = 0;
2311				if (i == 2)
2312					alu.last = 1;
2313				r = r600_bytecode_add_alu(ctx->bc, &alu);
2314				if (r)
2315					return r;
2316			}
2317		} else {
2318			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2319			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2320			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2321
2322			alu.src[0].sel = ctx->temp_reg;
2323			alu.src[0].chan = 0;
2324			alu.last = 1;
2325			r = r600_bytecode_add_alu(ctx->bc, &alu);
2326			if (r)
2327				return r;
2328		}
2329	}
2330
2331	/* dst.z = 0.0; */
2332	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2333		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2334
2335		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2336
2337		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2338
2339		alu.src[0].sel = V_SQ_ALU_SRC_0;
2340		alu.src[0].chan = 0;
2341
2342		alu.last = 1;
2343
2344		r = r600_bytecode_add_alu(ctx->bc, &alu);
2345		if (r)
2346			return r;
2347	}
2348
2349	/* dst.w = 1.0; */
2350	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2351		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2352
2353		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2354
2355		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2356
2357		alu.src[0].sel = V_SQ_ALU_SRC_1;
2358		alu.src[0].chan = 0;
2359
2360		alu.last = 1;
2361
2362		r = r600_bytecode_add_alu(ctx->bc, &alu);
2363		if (r)
2364			return r;
2365	}
2366
2367	return 0;
2368}
2369
2370static int tgsi_kill(struct r600_shader_ctx *ctx)
2371{
2372	struct r600_bytecode_alu alu;
2373	int i, r;
2374
2375	for (i = 0; i < 4; i++) {
2376		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2377		alu.inst = ctx->inst_info->r600_opcode;
2378
2379		alu.dst.chan = i;
2380
2381		alu.src[0].sel = V_SQ_ALU_SRC_0;
2382
2383		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) {
2384			alu.src[1].sel = V_SQ_ALU_SRC_1;
2385			alu.src[1].neg = 1;
2386		} else {
2387			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2388		}
2389		if (i == 3) {
2390			alu.last = 1;
2391		}
2392		r = r600_bytecode_add_alu(ctx->bc, &alu);
2393		if (r)
2394			return r;
2395	}
2396
2397	/* kill must be last in ALU */
2398	ctx->bc->force_add_cf = 1;
2399	ctx->shader->uses_kill = TRUE;
2400	return 0;
2401}
2402
2403static int tgsi_lit(struct r600_shader_ctx *ctx)
2404{
2405	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2406	struct r600_bytecode_alu alu;
2407	int r;
2408
2409	/* tmp.x = max(src.y, 0.0) */
2410	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2411	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2412	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
2413	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2414	alu.src[1].chan = 1;
2415
2416	alu.dst.sel = ctx->temp_reg;
2417	alu.dst.chan = 0;
2418	alu.dst.write = 1;
2419
2420	alu.last = 1;
2421	r = r600_bytecode_add_alu(ctx->bc, &alu);
2422	if (r)
2423		return r;
2424
2425	if (inst->Dst[0].Register.WriteMask & (1 << 2))
2426	{
2427		int chan;
2428		int sel;
2429		int i;
2430
2431		if (ctx->bc->chip_class == CAYMAN) {
2432			for (i = 0; i < 3; i++) {
2433				/* tmp.z = log(tmp.x) */
2434				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2435				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2436				alu.src[0].sel = ctx->temp_reg;
2437				alu.src[0].chan = 0;
2438				alu.dst.sel = ctx->temp_reg;
2439				alu.dst.chan = i;
2440				if (i == 2) {
2441					alu.dst.write = 1;
2442					alu.last = 1;
2443				} else
2444					alu.dst.write = 0;
2445
2446				r = r600_bytecode_add_alu(ctx->bc, &alu);
2447				if (r)
2448					return r;
2449			}
2450		} else {
2451			/* tmp.z = log(tmp.x) */
2452			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2453			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2454			alu.src[0].sel = ctx->temp_reg;
2455			alu.src[0].chan = 0;
2456			alu.dst.sel = ctx->temp_reg;
2457			alu.dst.chan = 2;
2458			alu.dst.write = 1;
2459			alu.last = 1;
2460			r = r600_bytecode_add_alu(ctx->bc, &alu);
2461			if (r)
2462				return r;
2463		}
2464
2465		chan = alu.dst.chan;
2466		sel = alu.dst.sel;
2467
2468		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
2469		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2470		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT);
2471		alu.src[0].sel  = sel;
2472		alu.src[0].chan = chan;
2473		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
2474		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
2475		alu.dst.sel = ctx->temp_reg;
2476		alu.dst.chan = 0;
2477		alu.dst.write = 1;
2478		alu.is_op3 = 1;
2479		alu.last = 1;
2480		r = r600_bytecode_add_alu(ctx->bc, &alu);
2481		if (r)
2482			return r;
2483
2484		if (ctx->bc->chip_class == CAYMAN) {
2485			for (i = 0; i < 3; i++) {
2486				/* dst.z = exp(tmp.x) */
2487				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2488				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2489				alu.src[0].sel = ctx->temp_reg;
2490				alu.src[0].chan = 0;
2491				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2492				if (i == 2) {
2493					alu.dst.write = 1;
2494					alu.last = 1;
2495				} else
2496					alu.dst.write = 0;
2497				r = r600_bytecode_add_alu(ctx->bc, &alu);
2498				if (r)
2499					return r;
2500			}
2501		} else {
2502			/* dst.z = exp(tmp.x) */
2503			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2504			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2505			alu.src[0].sel = ctx->temp_reg;
2506			alu.src[0].chan = 0;
2507			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2508			alu.last = 1;
2509			r = r600_bytecode_add_alu(ctx->bc, &alu);
2510			if (r)
2511				return r;
2512		}
2513	}
2514
2515	/* dst.x, <- 1.0  */
2516	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2517	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2518	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
2519	alu.src[0].chan = 0;
2520	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2521	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
2522	r = r600_bytecode_add_alu(ctx->bc, &alu);
2523	if (r)
2524		return r;
2525
2526	/* dst.y = max(src.x, 0.0) */
2527	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2528	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2529	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2530	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2531	alu.src[1].chan = 0;
2532	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2533	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
2534	r = r600_bytecode_add_alu(ctx->bc, &alu);
2535	if (r)
2536		return r;
2537
2538	/* dst.w, <- 1.0  */
2539	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2540	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2541	alu.src[0].sel  = V_SQ_ALU_SRC_1;
2542	alu.src[0].chan = 0;
2543	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2544	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
2545	alu.last = 1;
2546	r = r600_bytecode_add_alu(ctx->bc, &alu);
2547	if (r)
2548		return r;
2549
2550	return 0;
2551}
2552
2553static int tgsi_rsq(struct r600_shader_ctx *ctx)
2554{
2555	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2556	struct r600_bytecode_alu alu;
2557	int i, r;
2558
2559	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2560
2561	/* XXX:
2562	 * For state trackers other than OpenGL, we'll want to use
2563	 * _RECIPSQRT_IEEE instead.
2564	 */
2565	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED);
2566
2567	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2568		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2569		r600_bytecode_src_set_abs(&alu.src[i]);
2570	}
2571	alu.dst.sel = ctx->temp_reg;
2572	alu.dst.write = 1;
2573	alu.last = 1;
2574	r = r600_bytecode_add_alu(ctx->bc, &alu);
2575	if (r)
2576		return r;
2577	/* replicate result */
2578	return tgsi_helper_tempx_replicate(ctx);
2579}
2580
2581static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
2582{
2583	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2584	struct r600_bytecode_alu alu;
2585	int i, r;
2586
2587	for (i = 0; i < 4; i++) {
2588		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2589		alu.src[0].sel = ctx->temp_reg;
2590		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2591		alu.dst.chan = i;
2592		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2593		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2594		if (i == 3)
2595			alu.last = 1;
2596		r = r600_bytecode_add_alu(ctx->bc, &alu);
2597		if (r)
2598			return r;
2599	}
2600	return 0;
2601}
2602
2603static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
2604{
2605	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2606	struct r600_bytecode_alu alu;
2607	int i, r;
2608
2609	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2610	alu.inst = ctx->inst_info->r600_opcode;
2611	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2612		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2613	}
2614	alu.dst.sel = ctx->temp_reg;
2615	alu.dst.write = 1;
2616	alu.last = 1;
2617	r = r600_bytecode_add_alu(ctx->bc, &alu);
2618	if (r)
2619		return r;
2620	/* replicate result */
2621	return tgsi_helper_tempx_replicate(ctx);
2622}
2623
2624static int cayman_pow(struct r600_shader_ctx *ctx)
2625{
2626	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2627	int i, r;
2628	struct r600_bytecode_alu alu;
2629	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2630
2631	for (i = 0; i < 3; i++) {
2632		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2633		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2634		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2635		alu.dst.sel = ctx->temp_reg;
2636		alu.dst.chan = i;
2637		alu.dst.write = 1;
2638		if (i == 2)
2639			alu.last = 1;
2640		r = r600_bytecode_add_alu(ctx->bc, &alu);
2641		if (r)
2642			return r;
2643	}
2644
2645	/* b * LOG2(a) */
2646	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2647	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2648	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2649	alu.src[1].sel = ctx->temp_reg;
2650	alu.dst.sel = ctx->temp_reg;
2651	alu.dst.write = 1;
2652	alu.last = 1;
2653	r = r600_bytecode_add_alu(ctx->bc, &alu);
2654	if (r)
2655		return r;
2656
2657	for (i = 0; i < last_slot; i++) {
2658		/* POW(a,b) = EXP2(b * LOG2(a))*/
2659		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2660		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2661		alu.src[0].sel = ctx->temp_reg;
2662
2663		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2664		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2665		if (i == last_slot - 1)
2666			alu.last = 1;
2667		r = r600_bytecode_add_alu(ctx->bc, &alu);
2668		if (r)
2669			return r;
2670	}
2671	return 0;
2672}
2673
2674static int tgsi_pow(struct r600_shader_ctx *ctx)
2675{
2676	struct r600_bytecode_alu alu;
2677	int r;
2678
2679	/* LOG2(a) */
2680	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2681	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2682	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2683	alu.dst.sel = ctx->temp_reg;
2684	alu.dst.write = 1;
2685	alu.last = 1;
2686	r = r600_bytecode_add_alu(ctx->bc, &alu);
2687	if (r)
2688		return r;
2689	/* b * LOG2(a) */
2690	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2691	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2692	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2693	alu.src[1].sel = ctx->temp_reg;
2694	alu.dst.sel = ctx->temp_reg;
2695	alu.dst.write = 1;
2696	alu.last = 1;
2697	r = r600_bytecode_add_alu(ctx->bc, &alu);
2698	if (r)
2699		return r;
2700	/* POW(a,b) = EXP2(b * LOG2(a))*/
2701	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2702	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2703	alu.src[0].sel = ctx->temp_reg;
2704	alu.dst.sel = ctx->temp_reg;
2705	alu.dst.write = 1;
2706	alu.last = 1;
2707	r = r600_bytecode_add_alu(ctx->bc, &alu);
2708	if (r)
2709		return r;
2710	return tgsi_helper_tempx_replicate(ctx);
2711}
2712
2713static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
2714{
2715	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2716	struct r600_bytecode_alu alu;
2717	int i, r, j;
2718	unsigned write_mask = inst->Dst[0].Register.WriteMask;
2719	int tmp0 = ctx->temp_reg;
2720	int tmp1 = r600_get_temp(ctx);
2721	int tmp2 = r600_get_temp(ctx);
2722	int tmp3 = r600_get_temp(ctx);
2723	/* Unsigned path:
2724	 *
2725	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
2726	 *
2727	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
2728	 * 2. tmp0.z = lo (tmp0.x * src2)
2729	 * 3. tmp0.w = -tmp0.z
2730	 * 4. tmp0.y = hi (tmp0.x * src2)
2731	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
2732	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
2733	 * 7. tmp1.x = tmp0.x - tmp0.w
2734	 * 8. tmp1.y = tmp0.x + tmp0.w
2735	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
2736	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
2737	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
2738	 *
2739	 * 12. tmp0.w = src1 - tmp0.y       = r
2740	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
2741	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
2742	 *
2743	 * if DIV
2744	 *
2745	 *   15. tmp1.z = tmp0.z + 1			= q + 1
2746	 *   16. tmp1.w = tmp0.z - 1			= q - 1
2747	 *
2748	 * else MOD
2749	 *
2750	 *   15. tmp1.z = tmp0.w - src2			= r - src2
2751	 *   16. tmp1.w = tmp0.w + src2			= r + src2
2752	 *
2753	 * endif
2754	 *
2755	 * 17. tmp1.x = tmp1.x & tmp1.y
2756	 *
2757	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
2758	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
2759	 *
2760	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
2761	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
2762	 *
2763	 * Signed path:
2764	 *
2765	 * Same as unsigned, using abs values of the operands,
2766	 * and fixing the sign of the result in the end.
2767	 */
2768
2769	for (i = 0; i < 4; i++) {
2770		if (!(write_mask & (1<<i)))
2771			continue;
2772
2773		if (signed_op) {
2774
2775			/* tmp2.x = -src0 */
2776			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2777			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2778
2779			alu.dst.sel = tmp2;
2780			alu.dst.chan = 0;
2781			alu.dst.write = 1;
2782
2783			alu.src[0].sel = V_SQ_ALU_SRC_0;
2784
2785			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2786
2787			alu.last = 1;
2788			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2789				return r;
2790
2791			/* tmp2.y = -src1 */
2792			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2793			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2794
2795			alu.dst.sel = tmp2;
2796			alu.dst.chan = 1;
2797			alu.dst.write = 1;
2798
2799			alu.src[0].sel = V_SQ_ALU_SRC_0;
2800
2801			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2802
2803			alu.last = 1;
2804			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2805				return r;
2806
2807			/* tmp2.z sign bit is set if src0 and src2 signs are different */
2808			/* it will be a sign of the quotient */
2809			if (!mod) {
2810
2811				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2812				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT);
2813
2814				alu.dst.sel = tmp2;
2815				alu.dst.chan = 2;
2816				alu.dst.write = 1;
2817
2818				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2819				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2820
2821				alu.last = 1;
2822				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2823					return r;
2824			}
2825
2826			/* tmp2.x = |src0| */
2827			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2828			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2829			alu.is_op3 = 1;
2830
2831			alu.dst.sel = tmp2;
2832			alu.dst.chan = 0;
2833			alu.dst.write = 1;
2834
2835			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2836			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2837			alu.src[2].sel = tmp2;
2838			alu.src[2].chan = 0;
2839
2840			alu.last = 1;
2841			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2842				return r;
2843
2844			/* tmp2.y = |src1| */
2845			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2846			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2847			alu.is_op3 = 1;
2848
2849			alu.dst.sel = tmp2;
2850			alu.dst.chan = 1;
2851			alu.dst.write = 1;
2852
2853			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2854			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2855			alu.src[2].sel = tmp2;
2856			alu.src[2].chan = 1;
2857
2858			alu.last = 1;
2859			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2860				return r;
2861
2862		}
2863
2864		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
2865		if (ctx->bc->chip_class == CAYMAN) {
2866			/* tmp3.x = u2f(src2) */
2867			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2868			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
2869
2870			alu.dst.sel = tmp3;
2871			alu.dst.chan = 0;
2872			alu.dst.write = 1;
2873
2874			if (signed_op) {
2875				alu.src[0].sel = tmp2;
2876				alu.src[0].chan = 1;
2877			} else {
2878				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2879			}
2880
2881			alu.last = 1;
2882			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2883				return r;
2884
2885			/* tmp0.x = recip(tmp3.x) */
2886			for (j = 0 ; j < 3; j++) {
2887				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2888				alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE;
2889
2890				alu.dst.sel = tmp0;
2891				alu.dst.chan = j;
2892				alu.dst.write = (j == 0);
2893
2894				alu.src[0].sel = tmp3;
2895				alu.src[0].chan = 0;
2896
2897				if (j == 2)
2898					alu.last = 1;
2899				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2900					return r;
2901			}
2902
2903			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2904			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2905
2906			alu.src[0].sel = tmp0;
2907			alu.src[0].chan = 0;
2908
2909			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2910			alu.src[1].value = 0x4f800000;
2911
2912			alu.dst.sel = tmp3;
2913			alu.dst.write = 1;
2914			alu.last = 1;
2915			r = r600_bytecode_add_alu(ctx->bc, &alu);
2916			if (r)
2917				return r;
2918
2919			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2920			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
2921
2922			alu.dst.sel = tmp0;
2923			alu.dst.chan = 0;
2924			alu.dst.write = 1;
2925
2926			alu.src[0].sel = tmp3;
2927			alu.src[0].chan = 0;
2928
2929			alu.last = 1;
2930			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2931				return r;
2932
2933		} else {
2934			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2935			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT);
2936
2937			alu.dst.sel = tmp0;
2938			alu.dst.chan = 0;
2939			alu.dst.write = 1;
2940
2941			if (signed_op) {
2942				alu.src[0].sel = tmp2;
2943				alu.src[0].chan = 1;
2944			} else {
2945				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2946			}
2947
2948			alu.last = 1;
2949			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2950				return r;
2951		}
2952
2953		/* 2. tmp0.z = lo (tmp0.x * src2) */
2954		if (ctx->bc->chip_class == CAYMAN) {
2955			for (j = 0 ; j < 4; j++) {
2956				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2957				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2958
2959				alu.dst.sel = tmp0;
2960				alu.dst.chan = j;
2961				alu.dst.write = (j == 2);
2962
2963				alu.src[0].sel = tmp0;
2964				alu.src[0].chan = 0;
2965				if (signed_op) {
2966					alu.src[1].sel = tmp2;
2967					alu.src[1].chan = 1;
2968				} else {
2969					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2970				}
2971
2972				alu.last = (j == 3);
2973				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2974					return r;
2975			}
2976		} else {
2977			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2978			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2979
2980			alu.dst.sel = tmp0;
2981			alu.dst.chan = 2;
2982			alu.dst.write = 1;
2983
2984			alu.src[0].sel = tmp0;
2985			alu.src[0].chan = 0;
2986			if (signed_op) {
2987				alu.src[1].sel = tmp2;
2988				alu.src[1].chan = 1;
2989			} else {
2990				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2991			}
2992
2993			alu.last = 1;
2994			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2995				return r;
2996		}
2997
2998		/* 3. tmp0.w = -tmp0.z */
2999		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3000		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3001
3002		alu.dst.sel = tmp0;
3003		alu.dst.chan = 3;
3004		alu.dst.write = 1;
3005
3006		alu.src[0].sel = V_SQ_ALU_SRC_0;
3007		alu.src[1].sel = tmp0;
3008		alu.src[1].chan = 2;
3009
3010		alu.last = 1;
3011		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3012			return r;
3013
3014		/* 4. tmp0.y = hi (tmp0.x * src2) */
3015		if (ctx->bc->chip_class == CAYMAN) {
3016			for (j = 0 ; j < 4; j++) {
3017				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3018				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3019
3020				alu.dst.sel = tmp0;
3021				alu.dst.chan = j;
3022				alu.dst.write = (j == 1);
3023
3024				alu.src[0].sel = tmp0;
3025				alu.src[0].chan = 0;
3026
3027				if (signed_op) {
3028					alu.src[1].sel = tmp2;
3029					alu.src[1].chan = 1;
3030				} else {
3031					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3032				}
3033				alu.last = (j == 3);
3034				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3035					return r;
3036			}
3037		} else {
3038			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3039			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3040
3041			alu.dst.sel = tmp0;
3042			alu.dst.chan = 1;
3043			alu.dst.write = 1;
3044
3045			alu.src[0].sel = tmp0;
3046			alu.src[0].chan = 0;
3047
3048			if (signed_op) {
3049				alu.src[1].sel = tmp2;
3050				alu.src[1].chan = 1;
3051			} else {
3052				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3053			}
3054
3055			alu.last = 1;
3056			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3057				return r;
3058		}
3059
3060		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
3061		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3062		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3063		alu.is_op3 = 1;
3064
3065		alu.dst.sel = tmp0;
3066		alu.dst.chan = 2;
3067		alu.dst.write = 1;
3068
3069		alu.src[0].sel = tmp0;
3070		alu.src[0].chan = 1;
3071		alu.src[1].sel = tmp0;
3072		alu.src[1].chan = 3;
3073		alu.src[2].sel = tmp0;
3074		alu.src[2].chan = 2;
3075
3076		alu.last = 1;
3077		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3078			return r;
3079
3080		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
3081		if (ctx->bc->chip_class == CAYMAN) {
3082			for (j = 0 ; j < 4; j++) {
3083				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3084				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3085
3086				alu.dst.sel = tmp0;
3087				alu.dst.chan = j;
3088				alu.dst.write = (j == 3);
3089
3090				alu.src[0].sel = tmp0;
3091				alu.src[0].chan = 2;
3092
3093				alu.src[1].sel = tmp0;
3094				alu.src[1].chan = 0;
3095
3096				alu.last = (j == 3);
3097				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3098					return r;
3099			}
3100		} else {
3101			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3102			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3103
3104			alu.dst.sel = tmp0;
3105			alu.dst.chan = 3;
3106			alu.dst.write = 1;
3107
3108			alu.src[0].sel = tmp0;
3109			alu.src[0].chan = 2;
3110
3111			alu.src[1].sel = tmp0;
3112			alu.src[1].chan = 0;
3113
3114			alu.last = 1;
3115			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3116				return r;
3117		}
3118
3119		/* 7. tmp1.x = tmp0.x - tmp0.w */
3120		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3121		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3122
3123		alu.dst.sel = tmp1;
3124		alu.dst.chan = 0;
3125		alu.dst.write = 1;
3126
3127		alu.src[0].sel = tmp0;
3128		alu.src[0].chan = 0;
3129		alu.src[1].sel = tmp0;
3130		alu.src[1].chan = 3;
3131
3132		alu.last = 1;
3133		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3134			return r;
3135
3136		/* 8. tmp1.y = tmp0.x + tmp0.w */
3137		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3138		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3139
3140		alu.dst.sel = tmp1;
3141		alu.dst.chan = 1;
3142		alu.dst.write = 1;
3143
3144		alu.src[0].sel = tmp0;
3145		alu.src[0].chan = 0;
3146		alu.src[1].sel = tmp0;
3147		alu.src[1].chan = 3;
3148
3149		alu.last = 1;
3150		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3151			return r;
3152
3153		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
3154		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3155		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3156		alu.is_op3 = 1;
3157
3158		alu.dst.sel = tmp0;
3159		alu.dst.chan = 0;
3160		alu.dst.write = 1;
3161
3162		alu.src[0].sel = tmp0;
3163		alu.src[0].chan = 1;
3164		alu.src[1].sel = tmp1;
3165		alu.src[1].chan = 1;
3166		alu.src[2].sel = tmp1;
3167		alu.src[2].chan = 0;
3168
3169		alu.last = 1;
3170		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3171			return r;
3172
3173		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
3174		if (ctx->bc->chip_class == CAYMAN) {
3175			for (j = 0 ; j < 4; j++) {
3176				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3177				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3178
3179				alu.dst.sel = tmp0;
3180				alu.dst.chan = j;
3181				alu.dst.write = (j == 2);
3182
3183				alu.src[0].sel = tmp0;
3184				alu.src[0].chan = 0;
3185
3186				if (signed_op) {
3187					alu.src[1].sel = tmp2;
3188					alu.src[1].chan = 0;
3189				} else {
3190					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3191				}
3192
3193				alu.last = (j == 3);
3194				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3195					return r;
3196			}
3197		} else {
3198			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3199			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3200
3201			alu.dst.sel = tmp0;
3202			alu.dst.chan = 2;
3203			alu.dst.write = 1;
3204
3205			alu.src[0].sel = tmp0;
3206			alu.src[0].chan = 0;
3207
3208			if (signed_op) {
3209				alu.src[1].sel = tmp2;
3210				alu.src[1].chan = 0;
3211			} else {
3212				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3213			}
3214
3215			alu.last = 1;
3216			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3217				return r;
3218		}
3219
3220		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
3221		if (ctx->bc->chip_class == CAYMAN) {
3222			for (j = 0 ; j < 4; j++) {
3223				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3224				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3225
3226				alu.dst.sel = tmp0;
3227				alu.dst.chan = j;
3228				alu.dst.write = (j == 1);
3229
3230				if (signed_op) {
3231					alu.src[0].sel = tmp2;
3232					alu.src[0].chan = 1;
3233				} else {
3234					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3235				}
3236
3237				alu.src[1].sel = tmp0;
3238				alu.src[1].chan = 2;
3239
3240				alu.last = (j == 3);
3241				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3242					return r;
3243			}
3244		} else {
3245			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3246			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3247
3248			alu.dst.sel = tmp0;
3249			alu.dst.chan = 1;
3250			alu.dst.write = 1;
3251
3252			if (signed_op) {
3253				alu.src[0].sel = tmp2;
3254				alu.src[0].chan = 1;
3255			} else {
3256				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3257			}
3258
3259			alu.src[1].sel = tmp0;
3260			alu.src[1].chan = 2;
3261
3262			alu.last = 1;
3263			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3264				return r;
3265		}
3266
3267		/* 12. tmp0.w = src1 - tmp0.y       = r */
3268		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3269		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3270
3271		alu.dst.sel = tmp0;
3272		alu.dst.chan = 3;
3273		alu.dst.write = 1;
3274
3275		if (signed_op) {
3276			alu.src[0].sel = tmp2;
3277			alu.src[0].chan = 0;
3278		} else {
3279			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3280		}
3281
3282		alu.src[1].sel = tmp0;
3283		alu.src[1].chan = 1;
3284
3285		alu.last = 1;
3286		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3287			return r;
3288
3289		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
3290		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3291		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3292
3293		alu.dst.sel = tmp1;
3294		alu.dst.chan = 0;
3295		alu.dst.write = 1;
3296
3297		alu.src[0].sel = tmp0;
3298		alu.src[0].chan = 3;
3299		if (signed_op) {
3300			alu.src[1].sel = tmp2;
3301			alu.src[1].chan = 1;
3302		} else {
3303			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3304		}
3305
3306		alu.last = 1;
3307		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3308			return r;
3309
3310		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
3311		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3312		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3313
3314		alu.dst.sel = tmp1;
3315		alu.dst.chan = 1;
3316		alu.dst.write = 1;
3317
3318		if (signed_op) {
3319			alu.src[0].sel = tmp2;
3320			alu.src[0].chan = 0;
3321		} else {
3322			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3323		}
3324
3325		alu.src[1].sel = tmp0;
3326		alu.src[1].chan = 1;
3327
3328		alu.last = 1;
3329		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3330			return r;
3331
3332		if (mod) { /* UMOD */
3333
3334			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
3335			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3336			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3337
3338			alu.dst.sel = tmp1;
3339			alu.dst.chan = 2;
3340			alu.dst.write = 1;
3341
3342			alu.src[0].sel = tmp0;
3343			alu.src[0].chan = 3;
3344
3345			if (signed_op) {
3346				alu.src[1].sel = tmp2;
3347				alu.src[1].chan = 1;
3348			} else {
3349				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3350			}
3351
3352			alu.last = 1;
3353			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3354				return r;
3355
3356			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
3357			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3358			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3359
3360			alu.dst.sel = tmp1;
3361			alu.dst.chan = 3;
3362			alu.dst.write = 1;
3363
3364			alu.src[0].sel = tmp0;
3365			alu.src[0].chan = 3;
3366			if (signed_op) {
3367				alu.src[1].sel = tmp2;
3368				alu.src[1].chan = 1;
3369			} else {
3370				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3371			}
3372
3373			alu.last = 1;
3374			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3375				return r;
3376
3377		} else { /* UDIV */
3378
3379			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
3380			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3381			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3382
3383			alu.dst.sel = tmp1;
3384			alu.dst.chan = 2;
3385			alu.dst.write = 1;
3386
3387			alu.src[0].sel = tmp0;
3388			alu.src[0].chan = 2;
3389			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3390
3391			alu.last = 1;
3392			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3393				return r;
3394
3395			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
3396			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3397			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3398
3399			alu.dst.sel = tmp1;
3400			alu.dst.chan = 3;
3401			alu.dst.write = 1;
3402
3403			alu.src[0].sel = tmp0;
3404			alu.src[0].chan = 2;
3405			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
3406
3407			alu.last = 1;
3408			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3409				return r;
3410
3411		}
3412
3413		/* 17. tmp1.x = tmp1.x & tmp1.y */
3414		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3415		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
3416
3417		alu.dst.sel = tmp1;
3418		alu.dst.chan = 0;
3419		alu.dst.write = 1;
3420
3421		alu.src[0].sel = tmp1;
3422		alu.src[0].chan = 0;
3423		alu.src[1].sel = tmp1;
3424		alu.src[1].chan = 1;
3425
3426		alu.last = 1;
3427		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3428			return r;
3429
3430		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
3431		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
3432		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3433		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3434		alu.is_op3 = 1;
3435
3436		alu.dst.sel = tmp0;
3437		alu.dst.chan = 2;
3438		alu.dst.write = 1;
3439
3440		alu.src[0].sel = tmp1;
3441		alu.src[0].chan = 0;
3442		alu.src[1].sel = tmp0;
3443		alu.src[1].chan = mod ? 3 : 2;
3444		alu.src[2].sel = tmp1;
3445		alu.src[2].chan = 2;
3446
3447		alu.last = 1;
3448		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3449			return r;
3450
3451		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
3452		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3453		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3454		alu.is_op3 = 1;
3455
3456		if (signed_op) {
3457			alu.dst.sel = tmp0;
3458			alu.dst.chan = 2;
3459			alu.dst.write = 1;
3460		} else {
3461			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3462		}
3463
3464		alu.src[0].sel = tmp1;
3465		alu.src[0].chan = 1;
3466		alu.src[1].sel = tmp1;
3467		alu.src[1].chan = 3;
3468		alu.src[2].sel = tmp0;
3469		alu.src[2].chan = 2;
3470
3471		alu.last = 1;
3472		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3473			return r;
3474
3475		if (signed_op) {
3476
3477			/* fix the sign of the result */
3478
3479			if (mod) {
3480
3481				/* tmp0.x = -tmp0.z */
3482				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3483				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3484
3485				alu.dst.sel = tmp0;
3486				alu.dst.chan = 0;
3487				alu.dst.write = 1;
3488
3489				alu.src[0].sel = V_SQ_ALU_SRC_0;
3490				alu.src[1].sel = tmp0;
3491				alu.src[1].chan = 2;
3492
3493				alu.last = 1;
3494				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3495					return r;
3496
3497				/* sign of the remainder is the same as the sign of src0 */
3498				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
3499				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3500				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3501				alu.is_op3 = 1;
3502
3503				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3504
3505				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3506				alu.src[1].sel = tmp0;
3507				alu.src[1].chan = 2;
3508				alu.src[2].sel = tmp0;
3509				alu.src[2].chan = 0;
3510
3511				alu.last = 1;
3512				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3513					return r;
3514
3515			} else {
3516
3517				/* tmp0.x = -tmp0.z */
3518				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3519				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3520
3521				alu.dst.sel = tmp0;
3522				alu.dst.chan = 0;
3523				alu.dst.write = 1;
3524
3525				alu.src[0].sel = V_SQ_ALU_SRC_0;
3526				alu.src[1].sel = tmp0;
3527				alu.src[1].chan = 2;
3528
3529				alu.last = 1;
3530				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3531					return r;
3532
3533				/* fix the quotient sign (same as the sign of src0*src1) */
3534				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
3535				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3536				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3537				alu.is_op3 = 1;
3538
3539				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3540
3541				alu.src[0].sel = tmp2;
3542				alu.src[0].chan = 2;
3543				alu.src[1].sel = tmp0;
3544				alu.src[1].chan = 2;
3545				alu.src[2].sel = tmp0;
3546				alu.src[2].chan = 0;
3547
3548				alu.last = 1;
3549				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3550					return r;
3551			}
3552		}
3553	}
3554	return 0;
3555}
3556
/* UDIV: unsigned quotient, produced by the shared div/mod expansion. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3561
/* UMOD: unsigned remainder, produced by the shared div/mod expansion. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3566
/* IDIV: signed quotient, produced by the shared div/mod expansion. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3571
/* IMOD: signed remainder, produced by the shared div/mod expansion. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3576
3577
3578static int tgsi_f2i(struct r600_shader_ctx *ctx)
3579{
3580	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3581	struct r600_bytecode_alu alu;
3582	int i, r;
3583	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3584	int last_inst = tgsi_last_instruction(write_mask);
3585
3586	for (i = 0; i < 4; i++) {
3587		if (!(write_mask & (1<<i)))
3588			continue;
3589
3590		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3591		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC);
3592
3593		alu.dst.sel = ctx->temp_reg;
3594		alu.dst.chan = i;
3595		alu.dst.write = 1;
3596
3597		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3598		if (i == last_inst)
3599			alu.last = 1;
3600		r = r600_bytecode_add_alu(ctx->bc, &alu);
3601		if (r)
3602			return r;
3603	}
3604
3605	for (i = 0; i < 4; i++) {
3606		if (!(write_mask & (1<<i)))
3607			continue;
3608
3609		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3610		alu.inst = ctx->inst_info->r600_opcode;
3611
3612		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3613
3614		alu.src[0].sel = ctx->temp_reg;
3615		alu.src[0].chan = i;
3616
3617		if (i == last_inst || alu.inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT)
3618			alu.last = 1;
3619		r = r600_bytecode_add_alu(ctx->bc, &alu);
3620		if (r)
3621			return r;
3622	}
3623
3624	return 0;
3625}
3626
3627static int tgsi_iabs(struct r600_shader_ctx *ctx)
3628{
3629	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3630	struct r600_bytecode_alu alu;
3631	int i, r;
3632	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3633	int last_inst = tgsi_last_instruction(write_mask);
3634
3635	/* tmp = -src */
3636	for (i = 0; i < 4; i++) {
3637		if (!(write_mask & (1<<i)))
3638			continue;
3639
3640		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3641		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3642
3643		alu.dst.sel = ctx->temp_reg;
3644		alu.dst.chan = i;
3645		alu.dst.write = 1;
3646
3647		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3648		alu.src[0].sel = V_SQ_ALU_SRC_0;
3649
3650		if (i == last_inst)
3651			alu.last = 1;
3652		r = r600_bytecode_add_alu(ctx->bc, &alu);
3653		if (r)
3654			return r;
3655	}
3656
3657	/* dst = (src >= 0 ? src : tmp) */
3658	for (i = 0; i < 4; i++) {
3659		if (!(write_mask & (1<<i)))
3660			continue;
3661
3662		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3663		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3664		alu.is_op3 = 1;
3665		alu.dst.write = 1;
3666
3667		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3668
3669		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3670		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3671		alu.src[2].sel = ctx->temp_reg;
3672		alu.src[2].chan = i;
3673
3674		if (i == last_inst)
3675			alu.last = 1;
3676		r = r600_bytecode_add_alu(ctx->bc, &alu);
3677		if (r)
3678			return r;
3679	}
3680	return 0;
3681}
3682
3683static int tgsi_issg(struct r600_shader_ctx *ctx)
3684{
3685	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3686	struct r600_bytecode_alu alu;
3687	int i, r;
3688	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3689	int last_inst = tgsi_last_instruction(write_mask);
3690
3691	/* tmp = (src >= 0 ? src : -1) */
3692	for (i = 0; i < 4; i++) {
3693		if (!(write_mask & (1<<i)))
3694			continue;
3695
3696		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3697		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3698		alu.is_op3 = 1;
3699
3700		alu.dst.sel = ctx->temp_reg;
3701		alu.dst.chan = i;
3702		alu.dst.write = 1;
3703
3704		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3705		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3706		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
3707
3708		if (i == last_inst)
3709			alu.last = 1;
3710		r = r600_bytecode_add_alu(ctx->bc, &alu);
3711		if (r)
3712			return r;
3713	}
3714
3715	/* dst = (tmp > 0 ? 1 : tmp) */
3716	for (i = 0; i < 4; i++) {
3717		if (!(write_mask & (1<<i)))
3718			continue;
3719
3720		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3721		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT_INT);
3722		alu.is_op3 = 1;
3723		alu.dst.write = 1;
3724
3725		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3726
3727		alu.src[0].sel = ctx->temp_reg;
3728		alu.src[0].chan = i;
3729
3730		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3731
3732		alu.src[2].sel = ctx->temp_reg;
3733		alu.src[2].chan = i;
3734
3735		if (i == last_inst)
3736			alu.last = 1;
3737		r = r600_bytecode_add_alu(ctx->bc, &alu);
3738		if (r)
3739			return r;
3740	}
3741	return 0;
3742}
3743
3744
3745
3746static int tgsi_ssg(struct r600_shader_ctx *ctx)
3747{
3748	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3749	struct r600_bytecode_alu alu;
3750	int i, r;
3751
3752	/* tmp = (src > 0 ? 1 : src) */
3753	for (i = 0; i < 4; i++) {
3754		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3755		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3756		alu.is_op3 = 1;
3757
3758		alu.dst.sel = ctx->temp_reg;
3759		alu.dst.chan = i;
3760
3761		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3762		alu.src[1].sel = V_SQ_ALU_SRC_1;
3763		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
3764
3765		if (i == 3)
3766			alu.last = 1;
3767		r = r600_bytecode_add_alu(ctx->bc, &alu);
3768		if (r)
3769			return r;
3770	}
3771
3772	/* dst = (-tmp > 0 ? -1 : tmp) */
3773	for (i = 0; i < 4; i++) {
3774		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3775		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3776		alu.is_op3 = 1;
3777		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3778
3779		alu.src[0].sel = ctx->temp_reg;
3780		alu.src[0].chan = i;
3781		alu.src[0].neg = 1;
3782
3783		alu.src[1].sel = V_SQ_ALU_SRC_1;
3784		alu.src[1].neg = 1;
3785
3786		alu.src[2].sel = ctx->temp_reg;
3787		alu.src[2].chan = i;
3788
3789		if (i == 3)
3790			alu.last = 1;
3791		r = r600_bytecode_add_alu(ctx->bc, &alu);
3792		if (r)
3793			return r;
3794	}
3795	return 0;
3796}
3797
3798static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
3799{
3800	struct r600_bytecode_alu alu;
3801	int i, r;
3802
3803	for (i = 0; i < 4; i++) {
3804		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3805		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
3806			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
3807			alu.dst.chan = i;
3808		} else {
3809			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3810			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3811			alu.src[0].sel = ctx->temp_reg;
3812			alu.src[0].chan = i;
3813		}
3814		if (i == 3) {
3815			alu.last = 1;
3816		}
3817		r = r600_bytecode_add_alu(ctx->bc, &alu);
3818		if (r)
3819			return r;
3820	}
3821	return 0;
3822}
3823
3824static int tgsi_op3(struct r600_shader_ctx *ctx)
3825{
3826	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3827	struct r600_bytecode_alu alu;
3828	int i, j, r;
3829	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3830
3831	for (i = 0; i < lasti + 1; i++) {
3832		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3833			continue;
3834
3835		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3836		alu.inst = ctx->inst_info->r600_opcode;
3837		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3838			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3839		}
3840
3841		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3842		alu.dst.chan = i;
3843		alu.dst.write = 1;
3844		alu.is_op3 = 1;
3845		if (i == lasti) {
3846			alu.last = 1;
3847		}
3848		r = r600_bytecode_add_alu(ctx->bc, &alu);
3849		if (r)
3850			return r;
3851	}
3852	return 0;
3853}
3854
3855static int tgsi_dp(struct r600_shader_ctx *ctx)
3856{
3857	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3858	struct r600_bytecode_alu alu;
3859	int i, j, r;
3860
3861	for (i = 0; i < 4; i++) {
3862		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3863		alu.inst = ctx->inst_info->r600_opcode;
3864		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3865			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3866		}
3867
3868		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3869		alu.dst.chan = i;
3870		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3871		/* handle some special cases */
3872		switch (ctx->inst_info->tgsi_opcode) {
3873		case TGSI_OPCODE_DP2:
3874			if (i > 1) {
3875				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3876				alu.src[0].chan = alu.src[1].chan = 0;
3877			}
3878			break;
3879		case TGSI_OPCODE_DP3:
3880			if (i > 2) {
3881				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3882				alu.src[0].chan = alu.src[1].chan = 0;
3883			}
3884			break;
3885		case TGSI_OPCODE_DPH:
3886			if (i == 3) {
3887				alu.src[0].sel = V_SQ_ALU_SRC_1;
3888				alu.src[0].chan = 0;
3889				alu.src[0].neg = 0;
3890			}
3891			break;
3892		default:
3893			break;
3894		}
3895		if (i == 3) {
3896			alu.last = 1;
3897		}
3898		r = r600_bytecode_add_alu(ctx->bc, &alu);
3899		if (r)
3900			return r;
3901	}
3902	return 0;
3903}
3904
3905static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
3906						    unsigned index)
3907{
3908	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3909	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
3910		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
3911		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
3912		ctx->src[index].neg || ctx->src[index].abs;
3913}
3914
3915static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
3916					unsigned index)
3917{
3918	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3919	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
3920}
3921
3922static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
3923{
3924	struct r600_bytecode_vtx vtx;
3925	struct r600_bytecode_alu alu;
3926	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3927	int src_gpr, r, i;
3928	int id = tgsi_tex_get_src_gpr(ctx, 1);
3929
3930	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
3931	if (src_requires_loading) {
3932		for (i = 0; i < 4; i++) {
3933			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3934			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3935			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3936			alu.dst.sel = ctx->temp_reg;
3937			alu.dst.chan = i;
3938			if (i == 3)
3939				alu.last = 1;
3940			alu.dst.write = 1;
3941			r = r600_bytecode_add_alu(ctx->bc, &alu);
3942			if (r)
3943				return r;
3944		}
3945		src_gpr = ctx->temp_reg;
3946	}
3947
3948	memset(&vtx, 0, sizeof(vtx));
3949	vtx.inst = 0;
3950	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
3951	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
3952	vtx.src_gpr = src_gpr;
3953	vtx.mega_fetch_count = 16;
3954	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
3955	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
3956	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
3957	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
3958	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
3959	vtx.use_const_fields = 1;
3960	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
3961
3962	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
3963		return r;
3964
3965	if (ctx->bc->chip_class >= EVERGREEN)
3966		return 0;
3967
3968	for (i = 0; i < 4; i++) {
3969		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3970		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3971			continue;
3972
3973		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3974		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
3975
3976		alu.dst.chan = i;
3977		alu.dst.sel = vtx.dst_gpr;
3978		alu.dst.write = 1;
3979
3980		alu.src[0].sel = vtx.dst_gpr;
3981		alu.src[0].chan = i;
3982
3983		alu.src[1].sel = 512 + (id * 2);
3984		alu.src[1].chan = i % 4;
3985		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3986
3987		if (i == lasti)
3988			alu.last = 1;
3989		r = r600_bytecode_add_alu(ctx->bc, &alu);
3990		if (r)
3991			return r;
3992	}
3993
3994	if (inst->Dst[0].Register.WriteMask & 3) {
3995		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3996		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT);
3997
3998		alu.dst.chan = 3;
3999		alu.dst.sel = vtx.dst_gpr;
4000		alu.dst.write = 1;
4001
4002		alu.src[0].sel = vtx.dst_gpr;
4003		alu.src[0].chan = 3;
4004
4005		alu.src[1].sel = 512 + (id * 2) + 1;
4006		alu.src[1].chan = 0;
4007		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4008
4009		alu.last = 1;
4010		r = r600_bytecode_add_alu(ctx->bc, &alu);
4011		if (r)
4012			return r;
4013	}
4014	return 0;
4015}
4016
4017static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
4018{
4019	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4020	struct r600_bytecode_alu alu;
4021	int r;
4022	int id = tgsi_tex_get_src_gpr(ctx, 1);
4023
4024	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4025	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4026
4027	if (ctx->bc->chip_class >= EVERGREEN) {
4028		alu.src[0].sel = 512 + (id / 4);
4029		alu.src[0].chan = id % 4;
4030	} else {
4031		/* r600 we have them at channel 2 of the second dword */
4032		alu.src[0].sel = 512 + (id * 2) + 1;
4033		alu.src[0].chan = 1;
4034	}
4035	alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4036	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
4037	alu.last = 1;
4038	r = r600_bytecode_add_alu(ctx->bc, &alu);
4039	if (r)
4040		return r;
4041	return 0;
4042}
4043
4044static int tgsi_tex(struct r600_shader_ctx *ctx)
4045{
4046	static float one_point_five = 1.5f;
4047	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4048	struct r600_bytecode_tex tex;
4049	struct r600_bytecode_alu alu;
4050	unsigned src_gpr;
4051	int r, i, j;
4052	int opcode;
4053	bool read_compressed_msaa = ctx->bc->msaa_texture_mode == MSAA_TEXTURE_COMPRESSED &&
4054				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
4055				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
4056				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
4057	/* Texture fetch instructions can only use gprs as source.
4058	 * Also they cannot negate the source or take the absolute value */
4059	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
4060                                              tgsi_tex_src_requires_loading(ctx, 0)) ||
4061					     read_compressed_msaa;
4062	boolean src_loaded = FALSE;
4063	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
4064	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
4065	boolean has_txq_cube_array_z = false;
4066
4067	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
4068	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
4069	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
4070		if (inst->Dst[0].Register.WriteMask & 4) {
4071			ctx->shader->has_txq_cube_array_z_comp = true;
4072			has_txq_cube_array_z = true;
4073		}
4074
4075	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
4076	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
4077	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
4078		sampler_src_reg = 2;
4079
4080	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
4081
4082	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
4083		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
4084			ctx->shader->uses_tex_buffers = true;
4085			return r600_do_buffer_txq(ctx);
4086		}
4087		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
4088			if (ctx->bc->chip_class < EVERGREEN)
4089				ctx->shader->uses_tex_buffers = true;
4090			return do_vtx_fetch_inst(ctx, src_requires_loading);
4091		}
4092	}
4093
4094	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
4095		/* get offset values */
4096		if (inst->Texture.NumOffsets) {
4097			assert(inst->Texture.NumOffsets == 1);
4098
4099			offset_x = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
4100			offset_y = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
4101			offset_z = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
4102		}
4103	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
4104		/* TGSI moves the sampler to src reg 3 for TXD */
4105		sampler_src_reg = 3;
4106
4107		for (i = 1; i < 3; i++) {
4108			/* set gradients h/v */
4109			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4110			tex.inst = (i == 1) ? SQ_TEX_INST_SET_GRADIENTS_H :
4111				SQ_TEX_INST_SET_GRADIENTS_V;
4112			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4113			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4114
4115			if (tgsi_tex_src_requires_loading(ctx, i)) {
4116				tex.src_gpr = r600_get_temp(ctx);
4117				tex.src_sel_x = 0;
4118				tex.src_sel_y = 1;
4119				tex.src_sel_z = 2;
4120				tex.src_sel_w = 3;
4121
4122				for (j = 0; j < 4; j++) {
4123					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4124					alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4125                                        r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
4126                                        alu.dst.sel = tex.src_gpr;
4127                                        alu.dst.chan = j;
4128                                        if (j == 3)
4129                                                alu.last = 1;
4130                                        alu.dst.write = 1;
4131                                        r = r600_bytecode_add_alu(ctx->bc, &alu);
4132                                        if (r)
4133                                                return r;
4134				}
4135
4136			} else {
4137				tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i);
4138				tex.src_sel_x = ctx->src[i].swizzle[0];
4139				tex.src_sel_y = ctx->src[i].swizzle[1];
4140				tex.src_sel_z = ctx->src[i].swizzle[2];
4141				tex.src_sel_w = ctx->src[i].swizzle[3];
4142				tex.src_rel = ctx->src[i].rel;
4143			}
4144			tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */
4145			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
4146			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
4147				tex.coord_type_x = 1;
4148				tex.coord_type_y = 1;
4149				tex.coord_type_z = 1;
4150				tex.coord_type_w = 1;
4151			}
4152			r = r600_bytecode_add_tex(ctx->bc, &tex);
4153			if (r)
4154				return r;
4155		}
4156	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
4157		int out_chan;
4158		/* Add perspective divide */
4159		if (ctx->bc->chip_class == CAYMAN) {
4160			out_chan = 2;
4161			for (i = 0; i < 3; i++) {
4162				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4163				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4164				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4165
4166				alu.dst.sel = ctx->temp_reg;
4167				alu.dst.chan = i;
4168				if (i == 2)
4169					alu.last = 1;
4170				if (out_chan == i)
4171					alu.dst.write = 1;
4172				r = r600_bytecode_add_alu(ctx->bc, &alu);
4173				if (r)
4174					return r;
4175			}
4176
4177		} else {
4178			out_chan = 3;
4179			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4180			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4181			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4182
4183			alu.dst.sel = ctx->temp_reg;
4184			alu.dst.chan = out_chan;
4185			alu.last = 1;
4186			alu.dst.write = 1;
4187			r = r600_bytecode_add_alu(ctx->bc, &alu);
4188			if (r)
4189				return r;
4190		}
4191
4192		for (i = 0; i < 3; i++) {
4193			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4194			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4195			alu.src[0].sel = ctx->temp_reg;
4196			alu.src[0].chan = out_chan;
4197			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4198			alu.dst.sel = ctx->temp_reg;
4199			alu.dst.chan = i;
4200			alu.dst.write = 1;
4201			r = r600_bytecode_add_alu(ctx->bc, &alu);
4202			if (r)
4203				return r;
4204		}
4205		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4206		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4207		alu.src[0].sel = V_SQ_ALU_SRC_1;
4208		alu.src[0].chan = 0;
4209		alu.dst.sel = ctx->temp_reg;
4210		alu.dst.chan = 3;
4211		alu.last = 1;
4212		alu.dst.write = 1;
4213		r = r600_bytecode_add_alu(ctx->bc, &alu);
4214		if (r)
4215			return r;
4216		src_loaded = TRUE;
4217		src_gpr = ctx->temp_reg;
4218	}
4219
4220	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
4221	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
4222	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4223	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
4224	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
4225	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
4226
4227		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
4228		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
4229
4230		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
4231		for (i = 0; i < 4; i++) {
4232			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4233			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE);
4234			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
4235			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
4236			alu.dst.sel = ctx->temp_reg;
4237			alu.dst.chan = i;
4238			if (i == 3)
4239				alu.last = 1;
4240			alu.dst.write = 1;
4241			r = r600_bytecode_add_alu(ctx->bc, &alu);
4242			if (r)
4243				return r;
4244		}
4245
4246		/* tmp1.z = RCP_e(|tmp1.z|) */
4247		if (ctx->bc->chip_class == CAYMAN) {
4248			for (i = 0; i < 3; i++) {
4249				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4250				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4251				alu.src[0].sel = ctx->temp_reg;
4252				alu.src[0].chan = 2;
4253				alu.src[0].abs = 1;
4254				alu.dst.sel = ctx->temp_reg;
4255				alu.dst.chan = i;
4256				if (i == 2)
4257					alu.dst.write = 1;
4258				if (i == 2)
4259					alu.last = 1;
4260				r = r600_bytecode_add_alu(ctx->bc, &alu);
4261				if (r)
4262					return r;
4263			}
4264		} else {
4265			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4266			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4267			alu.src[0].sel = ctx->temp_reg;
4268			alu.src[0].chan = 2;
4269			alu.src[0].abs = 1;
4270			alu.dst.sel = ctx->temp_reg;
4271			alu.dst.chan = 2;
4272			alu.dst.write = 1;
4273			alu.last = 1;
4274			r = r600_bytecode_add_alu(ctx->bc, &alu);
4275			if (r)
4276				return r;
4277		}
4278
4279		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
4280		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
4281		 * muladd has no writemask, have to use another temp
4282		 */
4283		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4284		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4285		alu.is_op3 = 1;
4286
4287		alu.src[0].sel = ctx->temp_reg;
4288		alu.src[0].chan = 0;
4289		alu.src[1].sel = ctx->temp_reg;
4290		alu.src[1].chan = 2;
4291
4292		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4293		alu.src[2].chan = 0;
4294		alu.src[2].value = *(uint32_t *)&one_point_five;
4295
4296		alu.dst.sel = ctx->temp_reg;
4297		alu.dst.chan = 0;
4298		alu.dst.write = 1;
4299
4300		r = r600_bytecode_add_alu(ctx->bc, &alu);
4301		if (r)
4302			return r;
4303
4304		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4305		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4306		alu.is_op3 = 1;
4307
4308		alu.src[0].sel = ctx->temp_reg;
4309		alu.src[0].chan = 1;
4310		alu.src[1].sel = ctx->temp_reg;
4311		alu.src[1].chan = 2;
4312
4313		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4314		alu.src[2].chan = 0;
4315		alu.src[2].value = *(uint32_t *)&one_point_five;
4316
4317		alu.dst.sel = ctx->temp_reg;
4318		alu.dst.chan = 1;
4319		alu.dst.write = 1;
4320
4321		alu.last = 1;
4322		r = r600_bytecode_add_alu(ctx->bc, &alu);
4323		if (r)
4324			return r;
4325		/* write initial compare value into Z component
4326		  - W src 0 for shadow cube
4327		  - X src 1 for shadow cube array */
4328		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4329		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4330			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4331			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4332			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4333				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
4334			else
4335				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4336			alu.dst.sel = ctx->temp_reg;
4337			alu.dst.chan = 2;
4338			alu.dst.write = 1;
4339			alu.last = 1;
4340			r = r600_bytecode_add_alu(ctx->bc, &alu);
4341			if (r)
4342				return r;
4343		}
4344
4345		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
4346		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4347			if (ctx->bc->chip_class >= EVERGREEN) {
4348				int mytmp = r600_get_temp(ctx);
4349				static const float eight = 8.0f;
4350				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4351				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4352				alu.src[0].sel = ctx->temp_reg;
4353				alu.src[0].chan = 3;
4354				alu.dst.sel = mytmp;
4355				alu.dst.chan = 0;
4356				alu.dst.write = 1;
4357				alu.last = 1;
4358				r = r600_bytecode_add_alu(ctx->bc, &alu);
4359				if (r)
4360					return r;
4361
4362				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
4363				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4364				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4365				alu.is_op3 = 1;
4366				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4367				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4368				alu.src[1].chan = 0;
4369				alu.src[1].value = *(uint32_t *)&eight;
4370				alu.src[2].sel = mytmp;
4371				alu.src[2].chan = 0;
4372				alu.dst.sel = ctx->temp_reg;
4373				alu.dst.chan = 3;
4374				alu.dst.write = 1;
4375				alu.last = 1;
4376				r = r600_bytecode_add_alu(ctx->bc, &alu);
4377				if (r)
4378					return r;
4379			} else if (ctx->bc->chip_class < EVERGREEN) {
4380				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4381				tex.inst = SQ_TEX_INST_SET_CUBEMAP_INDEX;
4382				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4383				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4384				tex.src_gpr = r600_get_temp(ctx);
4385				tex.src_sel_x = 0;
4386				tex.src_sel_y = 0;
4387				tex.src_sel_z = 0;
4388				tex.src_sel_w = 0;
4389				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
4390				tex.coord_type_x = 1;
4391				tex.coord_type_y = 1;
4392				tex.coord_type_z = 1;
4393				tex.coord_type_w = 1;
4394				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4395				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4396				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4397				alu.dst.sel = tex.src_gpr;
4398				alu.dst.chan = 0;
4399				alu.last = 1;
4400				alu.dst.write = 1;
4401				r = r600_bytecode_add_alu(ctx->bc, &alu);
4402				if (r)
4403					return r;
4404
4405				r = r600_bytecode_add_tex(ctx->bc, &tex);
4406				if (r)
4407					return r;
4408			}
4409
4410		}
4411
4412		/* for cube forms of lod and bias we need to route things */
4413		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
4414		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
4415		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
4416		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
4417			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4418			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4419			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
4420			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
4421				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
4422			else
4423				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4424			alu.dst.sel = ctx->temp_reg;
4425			alu.dst.chan = 2;
4426			alu.last = 1;
4427			alu.dst.write = 1;
4428			r = r600_bytecode_add_alu(ctx->bc, &alu);
4429			if (r)
4430				return r;
4431		}
4432
4433		src_loaded = TRUE;
4434		src_gpr = ctx->temp_reg;
4435	}
4436
4437	if (src_requires_loading && !src_loaded) {
4438		for (i = 0; i < 4; i++) {
4439			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4440			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4441			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4442			alu.dst.sel = ctx->temp_reg;
4443			alu.dst.chan = i;
4444			if (i == 3)
4445				alu.last = 1;
4446			alu.dst.write = 1;
4447			r = r600_bytecode_add_alu(ctx->bc, &alu);
4448			if (r)
4449				return r;
4450		}
4451		src_loaded = TRUE;
4452		src_gpr = ctx->temp_reg;
4453	}
4454
4455	/* Obtain the sample index for reading a compressed MSAA color texture.
4456	 * To read the FMASK, we use the ldfptr instruction, which tells us
4457	 * where the samples are stored.
4458	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
4459	 * which is the identity mapping. Each nibble says which physical sample
4460	 * should be fetched to get that sample.
4461	 *
4462	 * Assume src.z contains the sample index. It should be modified like this:
4463	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
4464	 * Then fetch the texel with src.
4465	 */
4466	if (read_compressed_msaa) {
4467		unsigned sample_chan = inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ? 3 : 4;
4468		unsigned temp = r600_get_temp(ctx);
4469		assert(src_loaded);
4470
4471		/* temp.w = ldfptr() */
4472		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4473		tex.inst = SQ_TEX_INST_LD;
4474		tex.inst_mod = 1; /* to indicate this is ldfptr */
4475		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4476		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4477		tex.src_gpr = src_gpr;
4478		tex.dst_gpr = temp;
4479		tex.dst_sel_x = 7; /* mask out these components */
4480		tex.dst_sel_y = 7;
4481		tex.dst_sel_z = 7;
4482		tex.dst_sel_w = 0; /* store X */
4483		tex.src_sel_x = 0;
4484		tex.src_sel_y = 1;
4485		tex.src_sel_z = 2;
4486		tex.src_sel_w = 3;
4487		tex.offset_x = offset_x;
4488		tex.offset_y = offset_y;
4489		tex.offset_z = offset_z;
4490		r = r600_bytecode_add_tex(ctx->bc, &tex);
4491		if (r)
4492			return r;
4493
4494		/* temp.x = sample_index*4 */
4495		if (ctx->bc->chip_class == CAYMAN) {
4496			for (i = 0 ; i < 4; i++) {
4497				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4498				alu.inst = ctx->inst_info->r600_opcode;
4499				alu.src[0].sel = src_gpr;
4500				alu.src[0].chan = sample_chan;
4501				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4502				alu.src[1].value = 4;
4503				alu.dst.sel = temp;
4504				alu.dst.chan = i;
4505				alu.dst.write = i == 0;
4506				if (i == 3)
4507					alu.last = 1;
4508				r = r600_bytecode_add_alu(ctx->bc, &alu);
4509				if (r)
4510					return r;
4511			}
4512		} else {
4513			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4514			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT);
4515			alu.src[0].sel = src_gpr;
4516			alu.src[0].chan = sample_chan;
4517			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4518			alu.src[1].value = 4;
4519			alu.dst.sel = temp;
4520			alu.dst.chan = 0;
4521			alu.dst.write = 1;
4522			alu.last = 1;
4523			r = r600_bytecode_add_alu(ctx->bc, &alu);
4524			if (r)
4525				return r;
4526		}
4527
4528		/* sample_index = temp.w >> temp.x */
4529		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4530		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT);
4531		alu.src[0].sel = temp;
4532		alu.src[0].chan = 3;
4533		alu.src[1].sel = temp;
4534		alu.src[1].chan = 0;
4535		alu.dst.sel = src_gpr;
4536		alu.dst.chan = sample_chan;
4537		alu.dst.write = 1;
4538		alu.last = 1;
4539		r = r600_bytecode_add_alu(ctx->bc, &alu);
4540		if (r)
4541			return r;
4542
4543		/* sample_index & 0xF */
4544		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4545		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
4546		alu.src[0].sel = src_gpr;
4547		alu.src[0].chan = sample_chan;
4548		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4549		alu.src[1].value = 0xF;
4550		alu.dst.sel = src_gpr;
4551		alu.dst.chan = sample_chan;
4552		alu.dst.write = 1;
4553		alu.last = 1;
4554		r = r600_bytecode_add_alu(ctx->bc, &alu);
4555		if (r)
4556			return r;
4557#if 0
4558		/* visualize the FMASK */
4559		for (i = 0; i < 4; i++) {
4560			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4561			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
4562			alu.src[0].sel = src_gpr;
4563			alu.src[0].chan = sample_chan;
4564			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4565			alu.dst.chan = i;
4566			alu.dst.write = 1;
4567			alu.last = 1;
4568			r = r600_bytecode_add_alu(ctx->bc, &alu);
4569			if (r)
4570				return r;
4571		}
4572		return 0;
4573#endif
4574	}
4575
4576	/* does this shader want a num layers from TXQ for a cube array? */
4577	if (has_txq_cube_array_z) {
4578		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4579
4580		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4581		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4582
4583		alu.src[0].sel = 512 + (id / 4);
4584		alu.src[0].kc_bank = R600_TXQ_CONST_BUFFER;
4585		alu.src[0].chan = id % 4;
4586		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
4587		alu.last = 1;
4588		r = r600_bytecode_add_alu(ctx->bc, &alu);
4589		if (r)
4590			return r;
4591		/* disable writemask from texture instruction */
4592		inst->Dst[0].Register.WriteMask &= ~4;
4593	}
4594
4595	opcode = ctx->inst_info->r600_opcode;
4596	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4597	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4598	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4599	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4600	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
4601	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
4602	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4603		switch (opcode) {
4604		case SQ_TEX_INST_SAMPLE:
4605			opcode = SQ_TEX_INST_SAMPLE_C;
4606			break;
4607		case SQ_TEX_INST_SAMPLE_L:
4608			opcode = SQ_TEX_INST_SAMPLE_C_L;
4609			break;
4610		case SQ_TEX_INST_SAMPLE_LB:
4611			opcode = SQ_TEX_INST_SAMPLE_C_LB;
4612			break;
4613		case SQ_TEX_INST_SAMPLE_G:
4614			opcode = SQ_TEX_INST_SAMPLE_C_G;
4615			break;
4616		}
4617	}
4618
4619	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4620	tex.inst = opcode;
4621
4622	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4623	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4624	tex.src_gpr = src_gpr;
4625	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4626	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
4627	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
4628	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
4629	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
4630
4631	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
4632		tex.src_sel_x = 4;
4633		tex.src_sel_y = 4;
4634		tex.src_sel_z = 4;
4635		tex.src_sel_w = 4;
4636	} else if (src_loaded) {
4637		tex.src_sel_x = 0;
4638		tex.src_sel_y = 1;
4639		tex.src_sel_z = 2;
4640		tex.src_sel_w = 3;
4641	} else {
4642		tex.src_sel_x = ctx->src[0].swizzle[0];
4643		tex.src_sel_y = ctx->src[0].swizzle[1];
4644		tex.src_sel_z = ctx->src[0].swizzle[2];
4645		tex.src_sel_w = ctx->src[0].swizzle[3];
4646		tex.src_rel = ctx->src[0].rel;
4647	}
4648
4649	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
4650	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4651	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
4652	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4653		tex.src_sel_x = 1;
4654		tex.src_sel_y = 0;
4655		tex.src_sel_z = 3;
4656		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
4657	}
4658
4659	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
4660	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
4661		tex.coord_type_x = 1;
4662		tex.coord_type_y = 1;
4663	}
4664	tex.coord_type_z = 1;
4665	tex.coord_type_w = 1;
4666
4667	tex.offset_x = offset_x;
4668	tex.offset_y = offset_y;
4669	tex.offset_z = offset_z;
4670
4671	/* Put the depth for comparison in W.
4672	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
4673	 * Some instructions expect the depth in Z. */
4674	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4675	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4676	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4677	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
4678	    opcode != SQ_TEX_INST_SAMPLE_C_L &&
4679	    opcode != SQ_TEX_INST_SAMPLE_C_LB) {
4680		tex.src_sel_w = tex.src_sel_z;
4681	}
4682
4683	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
4684	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4685		if (opcode == SQ_TEX_INST_SAMPLE_C_L ||
4686		    opcode == SQ_TEX_INST_SAMPLE_C_LB) {
4687			/* the array index is read from Y */
4688			tex.coord_type_y = 0;
4689		} else {
4690			/* the array index is read from Z */
4691			tex.coord_type_z = 0;
4692			tex.src_sel_z = tex.src_sel_y;
4693		}
4694	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
4695		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
4696		   ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
4697		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
4698		    (ctx->bc->chip_class >= EVERGREEN)))
4699		/* the array index is read from Z */
4700		tex.coord_type_z = 0;
4701
4702	r = r600_bytecode_add_tex(ctx->bc, &tex);
4703	if (r)
4704		return r;
4705
4706	/* add shadow ambient support  - gallium doesn't do it yet */
4707	return 0;
4708}
4709
4710static int tgsi_lrp(struct r600_shader_ctx *ctx)
4711{
4712	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4713	struct r600_bytecode_alu alu;
4714	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4715	unsigned i;
4716	int r;
4717
4718	/* optimize if it's just an equal balance */
4719	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
4720		for (i = 0; i < lasti + 1; i++) {
4721			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4722				continue;
4723
4724			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4725			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4726			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4727			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4728			alu.omod = 3;
4729			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4730			alu.dst.chan = i;
4731			if (i == lasti) {
4732				alu.last = 1;
4733			}
4734			r = r600_bytecode_add_alu(ctx->bc, &alu);
4735			if (r)
4736				return r;
4737		}
4738		return 0;
4739	}
4740
4741	/* 1 - src0 */
4742	for (i = 0; i < lasti + 1; i++) {
4743		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4744			continue;
4745
4746		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4747		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4748		alu.src[0].sel = V_SQ_ALU_SRC_1;
4749		alu.src[0].chan = 0;
4750		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4751		r600_bytecode_src_toggle_neg(&alu.src[1]);
4752		alu.dst.sel = ctx->temp_reg;
4753		alu.dst.chan = i;
4754		if (i == lasti) {
4755			alu.last = 1;
4756		}
4757		alu.dst.write = 1;
4758		r = r600_bytecode_add_alu(ctx->bc, &alu);
4759		if (r)
4760			return r;
4761	}
4762
4763	/* (1 - src0) * src2 */
4764	for (i = 0; i < lasti + 1; i++) {
4765		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4766			continue;
4767
4768		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4769		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4770		alu.src[0].sel = ctx->temp_reg;
4771		alu.src[0].chan = i;
4772		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4773		alu.dst.sel = ctx->temp_reg;
4774		alu.dst.chan = i;
4775		if (i == lasti) {
4776			alu.last = 1;
4777		}
4778		alu.dst.write = 1;
4779		r = r600_bytecode_add_alu(ctx->bc, &alu);
4780		if (r)
4781			return r;
4782	}
4783
4784	/* src0 * src1 + (1 - src0) * src2 */
4785	for (i = 0; i < lasti + 1; i++) {
4786		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4787			continue;
4788
4789		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4790		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4791		alu.is_op3 = 1;
4792		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4793		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4794		alu.src[2].sel = ctx->temp_reg;
4795		alu.src[2].chan = i;
4796
4797		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4798		alu.dst.chan = i;
4799		if (i == lasti) {
4800			alu.last = 1;
4801		}
4802		r = r600_bytecode_add_alu(ctx->bc, &alu);
4803		if (r)
4804			return r;
4805	}
4806	return 0;
4807}
4808
4809static int tgsi_cmp(struct r600_shader_ctx *ctx)
4810{
4811	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4812	struct r600_bytecode_alu alu;
4813	int i, r;
4814	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4815
4816	for (i = 0; i < lasti + 1; i++) {
4817		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4818			continue;
4819
4820		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4821		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE);
4822		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4823		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4824		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
4825		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4826		alu.dst.chan = i;
4827		alu.dst.write = 1;
4828		alu.is_op3 = 1;
4829		if (i == lasti)
4830			alu.last = 1;
4831		r = r600_bytecode_add_alu(ctx->bc, &alu);
4832		if (r)
4833			return r;
4834	}
4835	return 0;
4836}
4837
4838static int tgsi_ucmp(struct r600_shader_ctx *ctx)
4839{
4840	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4841	struct r600_bytecode_alu alu;
4842	int i, r;
4843	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4844
4845	for (i = 0; i < lasti + 1; i++) {
4846		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4847			continue;
4848
4849		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4850		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
4851		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4852		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4853		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
4854		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4855		alu.dst.chan = i;
4856		alu.dst.write = 1;
4857		alu.is_op3 = 1;
4858		if (i == lasti)
4859			alu.last = 1;
4860		r = r600_bytecode_add_alu(ctx->bc, &alu);
4861		if (r)
4862			return r;
4863	}
4864	return 0;
4865}
4866
4867static int tgsi_xpd(struct r600_shader_ctx *ctx)
4868{
4869	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4870	static const unsigned int src0_swizzle[] = {2, 0, 1};
4871	static const unsigned int src1_swizzle[] = {1, 2, 0};
4872	struct r600_bytecode_alu alu;
4873	uint32_t use_temp = 0;
4874	int i, r;
4875
4876	if (inst->Dst[0].Register.WriteMask != 0xf)
4877		use_temp = 1;
4878
4879	for (i = 0; i < 4; i++) {
4880		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4881		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4882		if (i < 3) {
4883			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
4884			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
4885		} else {
4886			alu.src[0].sel = V_SQ_ALU_SRC_0;
4887			alu.src[0].chan = i;
4888			alu.src[1].sel = V_SQ_ALU_SRC_0;
4889			alu.src[1].chan = i;
4890		}
4891
4892		alu.dst.sel = ctx->temp_reg;
4893		alu.dst.chan = i;
4894		alu.dst.write = 1;
4895
4896		if (i == 3)
4897			alu.last = 1;
4898		r = r600_bytecode_add_alu(ctx->bc, &alu);
4899		if (r)
4900			return r;
4901	}
4902
4903	for (i = 0; i < 4; i++) {
4904		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4905		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4906
4907		if (i < 3) {
4908			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
4909			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
4910		} else {
4911			alu.src[0].sel = V_SQ_ALU_SRC_0;
4912			alu.src[0].chan = i;
4913			alu.src[1].sel = V_SQ_ALU_SRC_0;
4914			alu.src[1].chan = i;
4915		}
4916
4917		alu.src[2].sel = ctx->temp_reg;
4918		alu.src[2].neg = 1;
4919		alu.src[2].chan = i;
4920
4921		if (use_temp)
4922			alu.dst.sel = ctx->temp_reg;
4923		else
4924			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4925		alu.dst.chan = i;
4926		alu.dst.write = 1;
4927		alu.is_op3 = 1;
4928		if (i == 3)
4929			alu.last = 1;
4930		r = r600_bytecode_add_alu(ctx->bc, &alu);
4931		if (r)
4932			return r;
4933	}
4934	if (use_temp)
4935		return tgsi_helper_copy(ctx, inst);
4936	return 0;
4937}
4938
4939static int tgsi_exp(struct r600_shader_ctx *ctx)
4940{
4941	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4942	struct r600_bytecode_alu alu;
4943	int r;
4944	int i;
4945
4946	/* result.x = 2^floor(src); */
4947	if (inst->Dst[0].Register.WriteMask & 1) {
4948		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4949
4950		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4951		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4952
4953		alu.dst.sel = ctx->temp_reg;
4954		alu.dst.chan = 0;
4955		alu.dst.write = 1;
4956		alu.last = 1;
4957		r = r600_bytecode_add_alu(ctx->bc, &alu);
4958		if (r)
4959			return r;
4960
4961		if (ctx->bc->chip_class == CAYMAN) {
4962			for (i = 0; i < 3; i++) {
4963				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4964				alu.src[0].sel = ctx->temp_reg;
4965				alu.src[0].chan = 0;
4966
4967				alu.dst.sel = ctx->temp_reg;
4968				alu.dst.chan = i;
4969				alu.dst.write = i == 0;
4970				alu.last = i == 2;
4971				r = r600_bytecode_add_alu(ctx->bc, &alu);
4972				if (r)
4973					return r;
4974			}
4975		} else {
4976			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4977			alu.src[0].sel = ctx->temp_reg;
4978			alu.src[0].chan = 0;
4979
4980			alu.dst.sel = ctx->temp_reg;
4981			alu.dst.chan = 0;
4982			alu.dst.write = 1;
4983			alu.last = 1;
4984			r = r600_bytecode_add_alu(ctx->bc, &alu);
4985			if (r)
4986				return r;
4987		}
4988	}
4989
4990	/* result.y = tmp - floor(tmp); */
4991	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4992		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4993
4994		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
4995		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4996
4997		alu.dst.sel = ctx->temp_reg;
4998#if 0
4999		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5000		if (r)
5001			return r;
5002#endif
5003		alu.dst.write = 1;
5004		alu.dst.chan = 1;
5005
5006		alu.last = 1;
5007
5008		r = r600_bytecode_add_alu(ctx->bc, &alu);
5009		if (r)
5010			return r;
5011	}
5012
5013	/* result.z = RoughApprox2ToX(tmp);*/
5014	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
5015		if (ctx->bc->chip_class == CAYMAN) {
5016			for (i = 0; i < 3; i++) {
5017				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5018				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
5019				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5020
5021				alu.dst.sel = ctx->temp_reg;
5022				alu.dst.chan = i;
5023				if (i == 2) {
5024					alu.dst.write = 1;
5025					alu.last = 1;
5026				}
5027
5028				r = r600_bytecode_add_alu(ctx->bc, &alu);
5029				if (r)
5030					return r;
5031			}
5032		} else {
5033			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5034			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
5035			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5036
5037			alu.dst.sel = ctx->temp_reg;
5038			alu.dst.write = 1;
5039			alu.dst.chan = 2;
5040
5041			alu.last = 1;
5042
5043			r = r600_bytecode_add_alu(ctx->bc, &alu);
5044			if (r)
5045				return r;
5046		}
5047	}
5048
5049	/* result.w = 1.0;*/
5050	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
5051		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5052
5053		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
5054		alu.src[0].sel = V_SQ_ALU_SRC_1;
5055		alu.src[0].chan = 0;
5056
5057		alu.dst.sel = ctx->temp_reg;
5058		alu.dst.chan = 3;
5059		alu.dst.write = 1;
5060		alu.last = 1;
5061		r = r600_bytecode_add_alu(ctx->bc, &alu);
5062		if (r)
5063			return r;
5064	}
5065	return tgsi_helper_copy(ctx, inst);
5066}
5067
5068static int tgsi_log(struct r600_shader_ctx *ctx)
5069{
5070	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5071	struct r600_bytecode_alu alu;
5072	int r;
5073	int i;
5074
5075	/* result.x = floor(log2(|src|)); */
5076	if (inst->Dst[0].Register.WriteMask & 1) {
5077		if (ctx->bc->chip_class == CAYMAN) {
5078			for (i = 0; i < 3; i++) {
5079				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5080
5081				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
5082				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5083				r600_bytecode_src_set_abs(&alu.src[0]);
5084
5085				alu.dst.sel = ctx->temp_reg;
5086				alu.dst.chan = i;
5087				if (i == 0)
5088					alu.dst.write = 1;
5089				if (i == 2)
5090					alu.last = 1;
5091				r = r600_bytecode_add_alu(ctx->bc, &alu);
5092				if (r)
5093					return r;
5094			}
5095
5096		} else {
5097			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5098
5099			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
5100			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5101			r600_bytecode_src_set_abs(&alu.src[0]);
5102
5103			alu.dst.sel = ctx->temp_reg;
5104			alu.dst.chan = 0;
5105			alu.dst.write = 1;
5106			alu.last = 1;
5107			r = r600_bytecode_add_alu(ctx->bc, &alu);
5108			if (r)
5109				return r;
5110		}
5111
5112		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
5113		alu.src[0].sel = ctx->temp_reg;
5114		alu.src[0].chan = 0;
5115
5116		alu.dst.sel = ctx->temp_reg;
5117		alu.dst.chan = 0;
5118		alu.dst.write = 1;
5119		alu.last = 1;
5120
5121		r = r600_bytecode_add_alu(ctx->bc, &alu);
5122		if (r)
5123			return r;
5124	}
5125
5126	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
5127	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
5128
5129		if (ctx->bc->chip_class == CAYMAN) {
5130			for (i = 0; i < 3; i++) {
5131				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5132
5133				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
5134				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5135				r600_bytecode_src_set_abs(&alu.src[0]);
5136
5137				alu.dst.sel = ctx->temp_reg;
5138				alu.dst.chan = i;
5139				if (i == 1)
5140					alu.dst.write = 1;
5141				if (i == 2)
5142					alu.last = 1;
5143
5144				r = r600_bytecode_add_alu(ctx->bc, &alu);
5145				if (r)
5146					return r;
5147			}
5148		} else {
5149			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5150
5151			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
5152			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5153			r600_bytecode_src_set_abs(&alu.src[0]);
5154
5155			alu.dst.sel = ctx->temp_reg;
5156			alu.dst.chan = 1;
5157			alu.dst.write = 1;
5158			alu.last = 1;
5159
5160			r = r600_bytecode_add_alu(ctx->bc, &alu);
5161			if (r)
5162				return r;
5163		}
5164
5165		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5166
5167		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
5168		alu.src[0].sel = ctx->temp_reg;
5169		alu.src[0].chan = 1;
5170
5171		alu.dst.sel = ctx->temp_reg;
5172		alu.dst.chan = 1;
5173		alu.dst.write = 1;
5174		alu.last = 1;
5175
5176		r = r600_bytecode_add_alu(ctx->bc, &alu);
5177		if (r)
5178			return r;
5179
5180		if (ctx->bc->chip_class == CAYMAN) {
5181			for (i = 0; i < 3; i++) {
5182				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5183				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
5184				alu.src[0].sel = ctx->temp_reg;
5185				alu.src[0].chan = 1;
5186
5187				alu.dst.sel = ctx->temp_reg;
5188				alu.dst.chan = i;
5189				if (i == 1)
5190					alu.dst.write = 1;
5191				if (i == 2)
5192					alu.last = 1;
5193
5194				r = r600_bytecode_add_alu(ctx->bc, &alu);
5195				if (r)
5196					return r;
5197			}
5198		} else {
5199			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5200			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
5201			alu.src[0].sel = ctx->temp_reg;
5202			alu.src[0].chan = 1;
5203
5204			alu.dst.sel = ctx->temp_reg;
5205			alu.dst.chan = 1;
5206			alu.dst.write = 1;
5207			alu.last = 1;
5208
5209			r = r600_bytecode_add_alu(ctx->bc, &alu);
5210			if (r)
5211				return r;
5212		}
5213
5214		if (ctx->bc->chip_class == CAYMAN) {
5215			for (i = 0; i < 3; i++) {
5216				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5217				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
5218				alu.src[0].sel = ctx->temp_reg;
5219				alu.src[0].chan = 1;
5220
5221				alu.dst.sel = ctx->temp_reg;
5222				alu.dst.chan = i;
5223				if (i == 1)
5224					alu.dst.write = 1;
5225				if (i == 2)
5226					alu.last = 1;
5227
5228				r = r600_bytecode_add_alu(ctx->bc, &alu);
5229				if (r)
5230					return r;
5231			}
5232		} else {
5233			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5234			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
5235			alu.src[0].sel = ctx->temp_reg;
5236			alu.src[0].chan = 1;
5237
5238			alu.dst.sel = ctx->temp_reg;
5239			alu.dst.chan = 1;
5240			alu.dst.write = 1;
5241			alu.last = 1;
5242
5243			r = r600_bytecode_add_alu(ctx->bc, &alu);
5244			if (r)
5245				return r;
5246		}
5247
5248		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5249
5250		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
5251
5252		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5253		r600_bytecode_src_set_abs(&alu.src[0]);
5254
5255		alu.src[1].sel = ctx->temp_reg;
5256		alu.src[1].chan = 1;
5257
5258		alu.dst.sel = ctx->temp_reg;
5259		alu.dst.chan = 1;
5260		alu.dst.write = 1;
5261		alu.last = 1;
5262
5263		r = r600_bytecode_add_alu(ctx->bc, &alu);
5264		if (r)
5265			return r;
5266	}
5267
5268	/* result.z = log2(|src|);*/
5269	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
5270		if (ctx->bc->chip_class == CAYMAN) {
5271			for (i = 0; i < 3; i++) {
5272				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5273
5274				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
5275				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5276				r600_bytecode_src_set_abs(&alu.src[0]);
5277
5278				alu.dst.sel = ctx->temp_reg;
5279				if (i == 2)
5280					alu.dst.write = 1;
5281				alu.dst.chan = i;
5282				if (i == 2)
5283					alu.last = 1;
5284
5285				r = r600_bytecode_add_alu(ctx->bc, &alu);
5286				if (r)
5287					return r;
5288			}
5289		} else {
5290			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5291
5292			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
5293			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5294			r600_bytecode_src_set_abs(&alu.src[0]);
5295
5296			alu.dst.sel = ctx->temp_reg;
5297			alu.dst.write = 1;
5298			alu.dst.chan = 2;
5299			alu.last = 1;
5300
5301			r = r600_bytecode_add_alu(ctx->bc, &alu);
5302			if (r)
5303				return r;
5304		}
5305	}
5306
5307	/* result.w = 1.0; */
5308	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
5309		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5310
5311		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
5312		alu.src[0].sel = V_SQ_ALU_SRC_1;
5313		alu.src[0].chan = 0;
5314
5315		alu.dst.sel = ctx->temp_reg;
5316		alu.dst.chan = 3;
5317		alu.dst.write = 1;
5318		alu.last = 1;
5319
5320		r = r600_bytecode_add_alu(ctx->bc, &alu);
5321		if (r)
5322			return r;
5323	}
5324
5325	return tgsi_helper_copy(ctx, inst);
5326}
5327
5328static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
5329{
5330	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5331	struct r600_bytecode_alu alu;
5332	int r;
5333
5334	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5335
5336	switch (inst->Instruction.Opcode) {
5337	case TGSI_OPCODE_ARL:
5338		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR;
5339		break;
5340	case TGSI_OPCODE_ARR:
5341		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
5342		break;
5343	case TGSI_OPCODE_UARL:
5344		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
5345		break;
5346	default:
5347		assert(0);
5348		return -1;
5349	}
5350
5351	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5352	alu.last = 1;
5353	alu.dst.sel = ctx->bc->ar_reg;
5354	alu.dst.write = 1;
5355	r = r600_bytecode_add_alu(ctx->bc, &alu);
5356	if (r)
5357		return r;
5358
5359	ctx->bc->ar_loaded = 0;
5360	return 0;
5361}
5362static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
5363{
5364	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5365	struct r600_bytecode_alu alu;
5366	int r;
5367
5368	switch (inst->Instruction.Opcode) {
5369	case TGSI_OPCODE_ARL:
5370		memset(&alu, 0, sizeof(alu));
5371		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR;
5372		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5373		alu.dst.sel = ctx->bc->ar_reg;
5374		alu.dst.write = 1;
5375		alu.last = 1;
5376
5377		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5378			return r;
5379
5380		memset(&alu, 0, sizeof(alu));
5381		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
5382		alu.src[0].sel = ctx->bc->ar_reg;
5383		alu.dst.sel = ctx->bc->ar_reg;
5384		alu.dst.write = 1;
5385		alu.last = 1;
5386
5387		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5388			return r;
5389		break;
5390	case TGSI_OPCODE_ARR:
5391		memset(&alu, 0, sizeof(alu));
5392		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
5393		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5394		alu.dst.sel = ctx->bc->ar_reg;
5395		alu.dst.write = 1;
5396		alu.last = 1;
5397
5398		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5399			return r;
5400		break;
5401	case TGSI_OPCODE_UARL:
5402		memset(&alu, 0, sizeof(alu));
5403		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
5404		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5405		alu.dst.sel = ctx->bc->ar_reg;
5406		alu.dst.write = 1;
5407		alu.last = 1;
5408
5409		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5410			return r;
5411		break;
5412	default:
5413		assert(0);
5414		return -1;
5415	}
5416
5417	ctx->bc->ar_loaded = 0;
5418	return 0;
5419}
5420
5421static int tgsi_opdst(struct r600_shader_ctx *ctx)
5422{
5423	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5424	struct r600_bytecode_alu alu;
5425	int i, r = 0;
5426
5427	for (i = 0; i < 4; i++) {
5428		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5429
5430		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
5431		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5432
5433		if (i == 0 || i == 3) {
5434			alu.src[0].sel = V_SQ_ALU_SRC_1;
5435		} else {
5436			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5437		}
5438
5439		if (i == 0 || i == 2) {
5440			alu.src[1].sel = V_SQ_ALU_SRC_1;
5441		} else {
5442			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5443		}
5444		if (i == 3)
5445			alu.last = 1;
5446		r = r600_bytecode_add_alu(ctx->bc, &alu);
5447		if (r)
5448			return r;
5449	}
5450	return 0;
5451}
5452
5453static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
5454{
5455	struct r600_bytecode_alu alu;
5456	int r;
5457
5458	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5459	alu.inst = opcode;
5460	alu.execute_mask = 1;
5461	alu.update_pred = 1;
5462
5463	alu.dst.sel = ctx->temp_reg;
5464	alu.dst.write = 1;
5465	alu.dst.chan = 0;
5466
5467	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5468	alu.src[1].sel = V_SQ_ALU_SRC_0;
5469	alu.src[1].chan = 0;
5470
5471	alu.last = 1;
5472
5473	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
5474	if (r)
5475		return r;
5476	return 0;
5477}
5478
5479static int pops(struct r600_shader_ctx *ctx, int pops)
5480{
5481	unsigned force_pop = ctx->bc->force_add_cf;
5482
5483	if (!force_pop) {
5484		int alu_pop = 3;
5485		if (ctx->bc->cf_last) {
5486			if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU))
5487				alu_pop = 0;
5488			else if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER))
5489				alu_pop = 1;
5490		}
5491		alu_pop += pops;
5492		if (alu_pop == 1) {
5493			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER);
5494			ctx->bc->force_add_cf = 1;
5495		} else if (alu_pop == 2) {
5496			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER);
5497			ctx->bc->force_add_cf = 1;
5498		} else {
5499			force_pop = 1;
5500		}
5501	}
5502
5503	if (force_pop) {
5504		r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_POP));
5505		ctx->bc->cf_last->pop_count = pops;
5506		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
5507	}
5508
5509	return 0;
5510}
5511
5512static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
5513{
5514	switch(reason) {
5515	case FC_PUSH_VPM:
5516		ctx->bc->callstack[ctx->bc->call_sp].current--;
5517		break;
5518	case FC_PUSH_WQM:
5519	case FC_LOOP:
5520		ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
5521		break;
5522	case FC_REP:
5523		/* TOODO : for 16 vp asic should -= 2; */
5524		ctx->bc->callstack[ctx->bc->call_sp].current --;
5525		break;
5526	}
5527}
5528
5529static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
5530{
5531	if (check_max_only) {
5532		int diff;
5533		switch (reason) {
5534		case FC_PUSH_VPM:
5535			diff = 1;
5536			break;
5537		case FC_PUSH_WQM:
5538			diff = 4;
5539			break;
5540		default:
5541			assert(0);
5542			diff = 0;
5543		}
5544		if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
5545		    ctx->bc->callstack[ctx->bc->call_sp].max) {
5546			ctx->bc->callstack[ctx->bc->call_sp].max =
5547				ctx->bc->callstack[ctx->bc->call_sp].current + diff;
5548		}
5549		return;
5550	}
5551	switch (reason) {
5552	case FC_PUSH_VPM:
5553		ctx->bc->callstack[ctx->bc->call_sp].current++;
5554		break;
5555	case FC_PUSH_WQM:
5556	case FC_LOOP:
5557		ctx->bc->callstack[ctx->bc->call_sp].current += 4;
5558		break;
5559	case FC_REP:
5560		ctx->bc->callstack[ctx->bc->call_sp].current++;
5561		break;
5562	}
5563
5564	if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
5565	    ctx->bc->callstack[ctx->bc->call_sp].max) {
5566		ctx->bc->callstack[ctx->bc->call_sp].max =
5567			ctx->bc->callstack[ctx->bc->call_sp].current;
5568	}
5569}
5570
5571static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
5572{
5573	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
5574
5575	sp->mid = realloc((void *)sp->mid,
5576						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
5577	sp->mid[sp->num_mid] = ctx->bc->cf_last;
5578	sp->num_mid++;
5579}
5580
5581static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
5582{
5583	ctx->bc->fc_sp++;
5584	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
5585	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
5586}
5587
5588static void fc_poplevel(struct r600_shader_ctx *ctx)
5589{
5590	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
5591	free(sp->mid);
5592	sp->mid = NULL;
5593	sp->num_mid = 0;
5594	sp->start = NULL;
5595	sp->type = 0;
5596	ctx->bc->fc_sp--;
5597}
5598
#if 0
/*
 * Everything up to the matching #endif is compiled out.  It is scaffolding
 * for RETURN / flag-driven exits; several helpers are empty stubs.
 */

/* Add a RETURN CF instruction. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
	return 0;
}

/* Add a JUMP CF instruction with the given pop count; `offset` is unused. */
static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{
	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

/* Stub: emits nothing. */
static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

/* Stub: emits nothing. */
static void emit_testflag(struct r600_shader_ctx *ctx)
{
}

/* Test the flag, jump, reset the flag, pop ifidx + 1 levels, then RETURN. */
static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

/*
 * Test the flag, add the current instruction's CF opcode with pop_count 1,
 * register it as a mid point of level fc_sp, then pop once.
 */
static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
5646
5647static int tgsi_if(struct r600_shader_ctx *ctx)
5648{
5649	emit_logic_pred(ctx, CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
5650
5651	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
5652
5653	fc_pushlevel(ctx, FC_IF);
5654
5655	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
5656	return 0;
5657}
5658
5659static int tgsi_else(struct r600_shader_ctx *ctx)
5660{
5661	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_ELSE));
5662	ctx->bc->cf_last->pop_count = 1;
5663
5664	fc_set_mid(ctx, ctx->bc->fc_sp);
5665	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
5666	return 0;
5667}
5668
5669static int tgsi_endif(struct r600_shader_ctx *ctx)
5670{
5671	pops(ctx, 1);
5672	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
5673		R600_ERR("if/endif unbalanced in shader\n");
5674		return -1;
5675	}
5676
5677	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
5678		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5679		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
5680	} else {
5681		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
5682	}
5683	fc_poplevel(ctx);
5684
5685	callstack_decrease_current(ctx, FC_PUSH_VPM);
5686	return 0;
5687}
5688
5689static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
5690{
5691	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
5692	 * limited to 4096 iterations, like the other LOOP_* instructions. */
5693	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10));
5694
5695	fc_pushlevel(ctx, FC_LOOP);
5696
5697	/* check stack depth */
5698	callstack_check_depth(ctx, FC_LOOP, 0);
5699	return 0;
5700}
5701
5702static int tgsi_endloop(struct r600_shader_ctx *ctx)
5703{
5704	int i;
5705
5706	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END));
5707
5708	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
5709		R600_ERR("loop/endloop in shader code are not paired.\n");
5710		return -EINVAL;
5711	}
5712
5713	/* fixup loop pointers - from r600isa
5714	   LOOP END points to CF after LOOP START,
5715	   LOOP START point to CF after LOOP END
5716	   BRK/CONT point to LOOP END CF
5717	*/
5718	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
5719
5720	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5721
5722	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
5723		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
5724	}
5725	/* XXX add LOOPRET support */
5726	fc_poplevel(ctx);
5727	callstack_decrease_current(ctx, FC_LOOP);
5728	return 0;
5729}
5730
5731static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
5732{
5733	unsigned int fscp;
5734
5735	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
5736	{
5737		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
5738			break;
5739	}
5740
5741	if (fscp == 0) {
5742		R600_ERR("Break not inside loop/endloop pair\n");
5743		return -EINVAL;
5744	}
5745
5746	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
5747
5748	fc_set_mid(ctx, fscp);
5749
5750	callstack_check_depth(ctx, FC_PUSH_VPM, 1);
5751	return 0;
5752}
5753
5754static int tgsi_umad(struct r600_shader_ctx *ctx)
5755{
5756	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5757	struct r600_bytecode_alu alu;
5758	int i, j, r;
5759	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5760
5761	/* src0 * src1 */
5762	for (i = 0; i < lasti + 1; i++) {
5763		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5764			continue;
5765
5766		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5767
5768		alu.dst.chan = i;
5769		alu.dst.sel = ctx->temp_reg;
5770		alu.dst.write = 1;
5771
5772		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
5773		for (j = 0; j < 2; j++) {
5774		        r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5775		}
5776
5777		alu.last = 1;
5778		r = r600_bytecode_add_alu(ctx->bc, &alu);
5779		if (r)
5780			return r;
5781	}
5782
5783
5784	for (i = 0; i < lasti + 1; i++) {
5785		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5786			continue;
5787
5788		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5789		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5790
5791		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
5792
5793		alu.src[0].sel = ctx->temp_reg;
5794		alu.src[0].chan = i;
5795
5796		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5797		if (i == lasti) {
5798			alu.last = 1;
5799		}
5800		r = r600_bytecode_add_alu(ctx->bc, &alu);
5801		if (r)
5802			return r;
5803	}
5804	return 0;
5805}
5806
5807static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
5808	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5809	{TGSI_OPCODE_MOV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5810	{TGSI_OPCODE_LIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5811
5812	/* XXX:
5813	 * For state trackers other than OpenGL, we'll want to use
5814	 * _RECIP_IEEE instead.
5815	 */
5816	{TGSI_OPCODE_RCP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
5817
5818	{TGSI_OPCODE_RSQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_rsq},
5819	{TGSI_OPCODE_EXP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5820	{TGSI_OPCODE_LOG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5821	{TGSI_OPCODE_MUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5822	{TGSI_OPCODE_ADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5823	{TGSI_OPCODE_DP3,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5824	{TGSI_OPCODE_DP4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5825	{TGSI_OPCODE_DST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5826	{TGSI_OPCODE_MIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5827	{TGSI_OPCODE_MAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5828	{TGSI_OPCODE_SLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5829	{TGSI_OPCODE_SGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5830	{TGSI_OPCODE_MAD,	1, V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5831	{TGSI_OPCODE_SUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5832	{TGSI_OPCODE_LRP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5833	{TGSI_OPCODE_CND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5834	/* gap */
5835	{20,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5836	{TGSI_OPCODE_DP2A,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5837	/* gap */
5838	{22,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5839	{23,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5840	{TGSI_OPCODE_FRC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5841	{TGSI_OPCODE_CLAMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5842	{TGSI_OPCODE_FLR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5843	{TGSI_OPCODE_ROUND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5844	{TGSI_OPCODE_EX2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5845	{TGSI_OPCODE_LG2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5846	{TGSI_OPCODE_POW,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5847	{TGSI_OPCODE_XPD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5848	/* gap */
5849	{32,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5850	{TGSI_OPCODE_ABS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5851	{TGSI_OPCODE_RCC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5852	{TGSI_OPCODE_DPH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5853	{TGSI_OPCODE_COS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5854	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5855	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5856	{TGSI_OPCODE_KILP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5857	{TGSI_OPCODE_PK2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5858	{TGSI_OPCODE_PK2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5859	{TGSI_OPCODE_PK4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5860	{TGSI_OPCODE_PK4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5861	{TGSI_OPCODE_RFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5862	{TGSI_OPCODE_SEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5863	{TGSI_OPCODE_SFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5864	{TGSI_OPCODE_SGT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5865	{TGSI_OPCODE_SIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5866	{TGSI_OPCODE_SLE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5867	{TGSI_OPCODE_SNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5868	{TGSI_OPCODE_STR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5869	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5870	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5871	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5872	{TGSI_OPCODE_UP2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5873	{TGSI_OPCODE_UP2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5874	{TGSI_OPCODE_UP4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5875	{TGSI_OPCODE_UP4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5876	{TGSI_OPCODE_X2D,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5877	{TGSI_OPCODE_ARA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5878	{TGSI_OPCODE_ARR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5879	{TGSI_OPCODE_BRA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5880	{TGSI_OPCODE_CAL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5881	{TGSI_OPCODE_RET,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5882	{TGSI_OPCODE_SSG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5883	{TGSI_OPCODE_CMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5884	{TGSI_OPCODE_SCS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5885	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5886	{TGSI_OPCODE_NRM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5887	{TGSI_OPCODE_DIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5888	{TGSI_OPCODE_DP2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5889	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5890	{TGSI_OPCODE_BRK,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5891	{TGSI_OPCODE_IF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5892	/* gap */
5893	{75,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5894	{76,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5895	{TGSI_OPCODE_ELSE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5896	{TGSI_OPCODE_ENDIF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5897	/* gap */
5898	{79,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5899	{80,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5900	{TGSI_OPCODE_PUSHA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5901	{TGSI_OPCODE_POPA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5902	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5903	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5904	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5905	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5906	{TGSI_OPCODE_SHL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2_trans},
5907	/* gap */
5908	{88,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5909	{TGSI_OPCODE_AND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5910	{TGSI_OPCODE_OR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5911	{TGSI_OPCODE_MOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5912	{TGSI_OPCODE_XOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5913	{TGSI_OPCODE_SAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5914	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5915	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5916	{TGSI_OPCODE_CONT,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5917	{TGSI_OPCODE_EMIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5918	{TGSI_OPCODE_ENDPRIM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5919	{TGSI_OPCODE_BGNLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5920	{TGSI_OPCODE_BGNSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5921	{TGSI_OPCODE_ENDLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5922	{TGSI_OPCODE_ENDSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5923	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5924	/* gap */
5925	{104,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5926	{105,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5927	{106,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5928	{TGSI_OPCODE_NOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5929	/* gap */
5930	{108,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5931	{109,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5932	{110,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5933	{111,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5934	{TGSI_OPCODE_NRM4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5935	{TGSI_OPCODE_CALLNZ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5936	{TGSI_OPCODE_IFC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5937	{TGSI_OPCODE_BREAKC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5938	{TGSI_OPCODE_KIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5939	{TGSI_OPCODE_END,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5940	/* gap */
5941	{118,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5942	{TGSI_OPCODE_F2I,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2_trans},
5943	{TGSI_OPCODE_IDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5944	{TGSI_OPCODE_IMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5945	{TGSI_OPCODE_IMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5946	{TGSI_OPCODE_INEG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5947	{TGSI_OPCODE_ISGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5948	{TGSI_OPCODE_ISHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2_trans},
5949	{TGSI_OPCODE_ISLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5950	{TGSI_OPCODE_F2U,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2_trans},
5951	{TGSI_OPCODE_U2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5952	{TGSI_OPCODE_UADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5953	{TGSI_OPCODE_UDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5954	{TGSI_OPCODE_UMAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5955	{TGSI_OPCODE_UMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5956	{TGSI_OPCODE_UMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5957	{TGSI_OPCODE_UMOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5958	{TGSI_OPCODE_UMUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5959	{TGSI_OPCODE_USEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5960	{TGSI_OPCODE_USGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5961	{TGSI_OPCODE_USHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2_trans},
5962	{TGSI_OPCODE_USLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5963	{TGSI_OPCODE_USNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2_swap},
5964	{TGSI_OPCODE_SWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5965	{TGSI_OPCODE_CASE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5966	{TGSI_OPCODE_DEFAULT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5967	{TGSI_OPCODE_ENDSWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5968	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5969	{TGSI_OPCODE_SAMPLE_I,  0, 0, tgsi_unsupported},
5970	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
5971	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5972	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5973	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5974	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5975	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5976	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5977	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5978	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5979	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5980	{TGSI_OPCODE_UARL,      0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_r600_arl},
5981	{TGSI_OPCODE_UCMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ucmp},
5982	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5983	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5984	{TGSI_OPCODE_LOAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5985	{TGSI_OPCODE_STORE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5986	{TGSI_OPCODE_MFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5987	{TGSI_OPCODE_LFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5988	{TGSI_OPCODE_SFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5989	{TGSI_OPCODE_BARRIER,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5990	{TGSI_OPCODE_ATOMUADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5991	{TGSI_OPCODE_ATOMXCHG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5992	{TGSI_OPCODE_ATOMCAS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5993	{TGSI_OPCODE_ATOMAND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5994	{TGSI_OPCODE_ATOMOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5995	{TGSI_OPCODE_ATOMXOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5996	{TGSI_OPCODE_ATOMUMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5997	{TGSI_OPCODE_ATOMUMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5998	{TGSI_OPCODE_ATOMIMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5999	{TGSI_OPCODE_ATOMIMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6000	{TGSI_OPCODE_TEX2,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
6001	{TGSI_OPCODE_TXB2,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
6002	{TGSI_OPCODE_TXL2,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
6003	{TGSI_OPCODE_LAST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6004};
6005
/*
 * TGSI opcode translation table for the "eg" chip family (presumably
 * Evergreen -- inferred from the prefix; confirm where the table is selected).
 *
 * Every initializer carries four positional values:
 *   1. the TGSI opcode, or a bare number where the opcode numbering has a gap;
 *   2. a 0/1 flag -- it is 1 only on the MAD entry, which is also the only
 *      entry whose hardware opcode is an OP3 constant;
 *   3. a hardware opcode constant (ALU OP2/OP3, CF and TEX instruction names
 *      all appear; some entries simply use a literal 0);
 *   4. the handler function for the instruction.  tgsi_unsupported is used
 *      for every opcode without a dedicated handler (presumably it reports
 *      an error -- its body is outside this view).
 *
 * NOTE(review): entries are listed in ascending opcode order with numeric
 * placeholders for the unused values, so the table is presumably indexed
 * directly by TGSI opcode.  If so, entries must not be reordered, inserted
 * or removed without matching the opcode numbering -- verify against the
 * lookup code.
 */
6006static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
6007	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
6008	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
6009	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
6010	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, tgsi_trans_srcx_replicate},
6011	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, tgsi_rsq},
6012	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
6013	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
6014	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
6015	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
	/* DP3, DP4 (and DPH, DP2 further down) all map to DOT4 through tgsi_dp. */
6016	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
6017	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
6018	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
6019	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
6020	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
	/*
	 * NOTE(review): SLT reuses SETGT through tgsi_op2_swap -- presumably the
	 * handler exchanges the two sources.  SLE, ISLT and USLT below follow
	 * the same pattern with SETGE / SETGT_INT / SETGT_UINT.
	 */
6021	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
6022	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
	/* The only entry with the second field set to 1 and an OP3 opcode. */
6023	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
	/* NOTE(review): SUB is listed with the ADD opcode; presumably tgsi_op2 negates the second source -- confirm in the handler. */
6024	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
6025	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
6026	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6027	/* gap */
6028	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6029	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6030	/* gap */
6031	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6032	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6033	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
6034	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6035	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
6036	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
6037	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
6038	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
6039	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
6040	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
6041	/* gap */
6042	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* NOTE(review): ABS is listed with the MOV opcode; presumably tgsi_op2 applies an absolute-value source modifier -- confirm in the handler. */
6043	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
6044	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6045	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
6046	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
	/* DDX/DDY are routed through the texture handler with the gradient TEX opcodes. */
6047	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
6048	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
6049	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
6050	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6051	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6052	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6053	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6054	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6055	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
6056	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6057	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
6058	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
6059	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
6060	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
6061	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* TEX and TXP share the same SAMPLE opcode; both go through tgsi_tex. */
6062	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
6063	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
6064	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
6065	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6066	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6067	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6068	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6069	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6070	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* ARR shares the tgsi_eg_arl handler with ARL and UARL. */
6071	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
6072	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6073	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6074	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6075	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
6076	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
6077	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
6078	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
6079	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6080	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6081	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
6082	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
	/* BRK (and CONT below) carry a CF-encoding opcode rather than an ALU one. */
6083	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
6084	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
6085	/* gap */
6086	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6087	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6088	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
6089	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
6090	/* gap */
6091	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6092	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6093	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6094	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6095	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
6096	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
6097	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
6098	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
6099	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
6100	/* gap */
6101	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6102	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
6103	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
6104	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
6105	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
6106	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6107	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
6108	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
6109	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
6110	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6111	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6112	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
6113	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6114	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
6115	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6116	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
6117	/* gap */
6118	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6119	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6120	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* NOTE(review): TGSI NOP is wired to tgsi_unsupported here -- confirm that is intended. */
6121	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6122	/* gap */
6123	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6124	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6125	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6126	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6127	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6128	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6129	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6130	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6131	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
6132	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
6133	/* gap */
6134	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* Integer / unsigned opcodes start here.  F2I and F2U share the tgsi_f2i handler. */
6135	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_f2i},
6136	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
6137	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
6138	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
6139	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
6140	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
6141	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
6142	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
6143	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_f2i},
6144	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
6145	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
6146	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
6147	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
6148	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
6149	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
6150	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
6151	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
6152	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
6153	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
6154	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
6155	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
6156	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
6157	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6158	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6159	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6160	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* The SAMPLE* / GATHER4 / SVIEWINFO family is unsupported and uses a literal 0 opcode. */
6161	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
6162	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
6163	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
6164	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
6165	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
6166	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
6167	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
6168	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
6169	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
6170	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
6171	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
6172	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
6173	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
	/*
	 * NOTE(review): UCMP and the LOAD..ATOMIMAX entries below use the
	 * un-prefixed V_SQ_..._NOP constant, unlike the EG_-prefixed constants
	 * used everywhere else in this table.  Confirm the two macros have the
	 * same value, or that these handlers ignore the field.
	 */
6174	{TGSI_OPCODE_UCMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ucmp},
6175	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
6176	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
6177	{TGSI_OPCODE_LOAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6178	{TGSI_OPCODE_STORE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6179	{TGSI_OPCODE_MFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6180	{TGSI_OPCODE_LFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6181	{TGSI_OPCODE_SFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6182	{TGSI_OPCODE_BARRIER,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6183	{TGSI_OPCODE_ATOMUADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6184	{TGSI_OPCODE_ATOMXCHG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6185	{TGSI_OPCODE_ATOMCAS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6186	{TGSI_OPCODE_ATOMAND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6187	{TGSI_OPCODE_ATOMOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6188	{TGSI_OPCODE_ATOMXOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6189	{TGSI_OPCODE_ATOMUMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6190	{TGSI_OPCODE_ATOMUMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6191	{TGSI_OPCODE_ATOMIMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6192	{TGSI_OPCODE_ATOMIMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* TEX2/TXB2/TXL2 reuse the TEX opcodes of TEX/TXB/TXL above. */
6193	{TGSI_OPCODE_TEX2,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
6194	{TGSI_OPCODE_TXB2,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
6195	{TGSI_OPCODE_TXL2,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
	/* Final entry, keyed by TGSI_OPCODE_LAST. */
6196	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6197};
6198
/*
 * TGSI opcode translation table for the "cm" chip family (presumably
 * Cayman -- inferred from the prefix and the cayman_* handlers; confirm
 * where the table is selected).
 *
 * Every initializer carries four positional values: the TGSI opcode (or a
 * bare number for a gap in the numbering), a 0/1 flag that is 1 only on the
 * MAD entry with its OP3 opcode, a hardware opcode constant, and the
 * handler function.  The hardware opcode constants are the EG_-prefixed
 * ones, as in eg_shader_tgsi_instruction.
 *
 * Compared with eg_shader_tgsi_instruction, the entries that differ are
 * (at least):
 *   - RCP, RSQ, EX2, LG2      -> cayman_emit_float_instr
 *   - POW                     -> cayman_pow
 *   - COS, SIN                -> cayman_trig
 *   - I2F, F2I, F2U, U2F      -> plain tgsi_op2
 *   - UMUL                    -> MULLO_INT with cayman_mul_int_instr
 * The "CAYMAN notes" comment near the top of this file describes why these
 * instructions are handled differently on this chip.
 *
 * NOTE(review): entries are listed in ascending opcode order with numeric
 * placeholders for the unused values, so the table is presumably indexed
 * directly by TGSI opcode.  If so, entries must not be reordered, inserted
 * or removed without matching the opcode numbering -- verify against the
 * lookup code.
 */
6199static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
6200	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
6201	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
6202	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
	/* cm-specific handler (the eg table uses tgsi_trans_srcx_replicate / tgsi_rsq here). */
6203	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, cayman_emit_float_instr},
6204	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, cayman_emit_float_instr},
6205	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
6206	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
6207	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
6208	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
	/* DP3, DP4 (and DPH, DP2 further down) all map to DOT4 through tgsi_dp. */
6209	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
6210	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
6211	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
6212	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
6213	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
	/*
	 * NOTE(review): SLT reuses SETGT through tgsi_op2_swap -- presumably the
	 * handler exchanges the two sources.  SLE, ISLT and USLT below follow
	 * the same pattern with SETGE / SETGT_INT / SETGT_UINT.
	 */
6214	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
6215	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
	/* The only entry with the second field set to 1 and an OP3 opcode. */
6216	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
	/* NOTE(review): SUB is listed with the ADD opcode; presumably tgsi_op2 negates the second source -- confirm in the handler. */
6217	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
6218	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
6219	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6220	/* gap */
6221	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6222	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6223	/* gap */
6224	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6225	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6226	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
6227	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6228	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
6229	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
	/* cm-specific handlers for EX2, LG2 and POW. */
6230	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, cayman_emit_float_instr},
6231	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, cayman_emit_float_instr},
6232	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, cayman_pow},
6233	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
6234	/* gap */
6235	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* NOTE(review): ABS is listed with the MOV opcode; presumably tgsi_op2 applies an absolute-value source modifier -- confirm in the handler. */
6236	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
6237	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6238	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
	/* cm-specific trig handler (also used for SIN below). */
6239	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, cayman_trig},
	/* DDX/DDY are routed through the texture handler with the gradient TEX opcodes. */
6240	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
6241	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
6242	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
6243	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6244	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6245	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6246	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6247	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6248	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
6249	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6250	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
6251	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, cayman_trig},
6252	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
6253	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
6254	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* TEX and TXP share the same SAMPLE opcode; both go through tgsi_tex. */
6255	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
6256	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
6257	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
6258	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6259	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6260	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6261	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6262	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6263	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* ARR shares the tgsi_eg_arl handler with ARL and UARL. */
6264	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
6265	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6266	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6267	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6268	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
6269	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
6270	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
6271	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
6272	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6273	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6274	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
6275	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
	/* BRK (and CONT below) carry a CF-encoding opcode rather than an ALU one. */
6276	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
6277	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
6278	/* gap */
6279	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6280	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6281	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
6282	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
6283	/* gap */
6284	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6285	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6286	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6287	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6288	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
	/* Uses plain tgsi_op2 here (the eg table uses tgsi_op2_trans); see the CAYMAN notes at the top of the file. */
6289	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2},
6290	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
6291	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
6292	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
6293	/* gap */
6294	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6295	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
6296	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
6297	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
6298	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
6299	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6300	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
6301	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
6302	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
6303	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6304	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6305	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
6306	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6307	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
6308	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6309	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
6310	/* gap */
6311	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6312	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6313	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* NOTE(review): TGSI NOP is wired to tgsi_unsupported here -- confirm that is intended. */
6314	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6315	/* gap */
6316	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6317	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6318	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6319	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6320	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6321	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6322	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6323	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6324	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
6325	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
6326	/* gap */
6327	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* Integer / unsigned opcodes start here.  F2I, F2U and U2F use plain tgsi_op2 in this table. */
6328	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2},
6329	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
6330	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
6331	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
6332	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
6333	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
6334	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
6335	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
6336	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
6337	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2},
6338	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
6339	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
6340	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
6341	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
6342	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
6343	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
	/* Listed with MULLO_INT and a cm-specific handler (the eg table uses MULLO_UINT with tgsi_op2_trans). */
6344	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT, cayman_mul_int_instr},
6345	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
6346	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
6347	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
6348	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
6349	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
6350	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6351	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6352	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6353	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* The SAMPLE* / GATHER4 / SVIEWINFO family is unsupported and uses a literal 0 opcode. */
6354	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
6355	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
6356	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
6357	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
6358	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
6359	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
6360	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
6361	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
6362	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
6363	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
6364	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
6365	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
6366	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
	/*
	 * NOTE(review): UCMP and the LOAD..ATOMIMAX entries below use the
	 * un-prefixed V_SQ_..._NOP constant, unlike the EG_-prefixed constants
	 * used everywhere else in this table.  Confirm the two macros have the
	 * same value, or that these handlers ignore the field.
	 */
6367	{TGSI_OPCODE_UCMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ucmp},
6368	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
6369	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
6370	{TGSI_OPCODE_LOAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6371	{TGSI_OPCODE_STORE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6372	{TGSI_OPCODE_MFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6373	{TGSI_OPCODE_LFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6374	{TGSI_OPCODE_SFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6375	{TGSI_OPCODE_BARRIER,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6376	{TGSI_OPCODE_ATOMUADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6377	{TGSI_OPCODE_ATOMXCHG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6378	{TGSI_OPCODE_ATOMCAS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6379	{TGSI_OPCODE_ATOMAND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6380	{TGSI_OPCODE_ATOMOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6381	{TGSI_OPCODE_ATOMXOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6382	{TGSI_OPCODE_ATOMUMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6383	{TGSI_OPCODE_ATOMUMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6384	{TGSI_OPCODE_ATOMIMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6385	{TGSI_OPCODE_ATOMIMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* TEX2/TXB2/TXL2 reuse the TEX opcodes of TEX/TXB/TXL above. */
6386	{TGSI_OPCODE_TEX2,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
6387	{TGSI_OPCODE_TXB2,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
6388	{TGSI_OPCODE_TXL2,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
	/* Final entry, keyed by TGSI_OPCODE_LAST. */
6389	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6390};
6391