r600_shader.c revision d23aa650015ec017649f5a4ce8cb12d8c314bd3a
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600_shader.h"
28#include "r600d.h"
29
30#include "pipe/p_shader_tokens.h"
31#include "tgsi/tgsi_info.h"
32#include "tgsi/tgsi_parse.h"
33#include "tgsi/tgsi_scan.h"
34#include "tgsi/tgsi_dump.h"
35#include "util/u_memory.h"
36#include <stdio.h>
37#include <errno.h>
38#include <byteswap.h>
39
40/* CAYMAN notes
41Why CAYMAN got loops for lots of instructions is explained here.
42
43-These 8xx t-slot only ops are implemented in all vector slots.
44MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
45These 8xx t-slot only opcodes become vector ops, with all four
46slots expecting the arguments on sources a and b. Result is
47broadcast to all channels.
48MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
49These 8xx t-slot only opcodes become vector ops in the z, y, and
50x slots.
51EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
52RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
53SQRT_IEEE/_64
54SIN/COS
55The w slot may have an independent co-issued operation, or if the
56result is required to be in the w slot, the opcode above may be
57issued in the w slot as well.
58The compiler must issue the source argument to slots z, y, and x
59*/
60
61static int r600_pipe_shader(struct pipe_context *ctx, struct r600_pipe_shader *shader)
62{
63	struct r600_context *rctx = (struct r600_context *)ctx;
64	struct r600_shader *rshader = &shader->shader;
65	uint32_t *ptr;
66	int	i;
67
68	/* copy new shader */
69	if (shader->bo == NULL) {
70		shader->bo = (struct r600_resource*)
71			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, rshader->bc.ndw * 4);
72		if (shader->bo == NULL) {
73			return -ENOMEM;
74		}
75		ptr = (uint32_t*)rctx->ws->buffer_map(shader->bo->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
76		if (R600_BIG_ENDIAN) {
77			for (i = 0; i < rshader->bc.ndw; ++i) {
78				ptr[i] = bswap_32(rshader->bc.bytecode[i]);
79			}
80		} else {
81			memcpy(ptr, rshader->bc.bytecode, rshader->bc.ndw * sizeof(*ptr));
82		}
83		rctx->ws->buffer_unmap(shader->bo->cs_buf);
84	}
85	/* build state */
86	switch (rshader->processor_type) {
87	case TGSI_PROCESSOR_VERTEX:
88		if (rctx->chip_class >= EVERGREEN) {
89			evergreen_pipe_shader_vs(ctx, shader);
90		} else {
91			r600_pipe_shader_vs(ctx, shader);
92		}
93		break;
94	case TGSI_PROCESSOR_FRAGMENT:
95		if (rctx->chip_class >= EVERGREEN) {
96			evergreen_pipe_shader_ps(ctx, shader);
97		} else {
98			r600_pipe_shader_ps(ctx, shader);
99		}
100		break;
101	default:
102		return -EINVAL;
103	}
104	return 0;
105}
106
107static int r600_shader_from_tgsi(struct r600_screen *rscreen,
108				 struct r600_pipe_shader *pipeshader,
109				 struct r600_shader_key key);
110
111static void r600_dump_streamout(struct pipe_stream_output_info *so)
112{
113	unsigned i;
114
115	fprintf(stderr, "STREAMOUT\n");
116	for (i = 0; i < so->num_outputs; i++) {
117		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
118				so->output[i].start_component;
119		fprintf(stderr, "  %i: MEM_STREAM0_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
120			i, so->output[i].output_buffer,
121			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
122			so->output[i].register_index,
123			mask & 1 ? "x" : "",
124		        mask & 2 ? "y" : "",
125		        mask & 4 ? "z" : "",
126		        mask & 8 ? "w" : "",
127			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
128	}
129}
130
131int r600_pipe_shader_create(struct pipe_context *ctx,
132			    struct r600_pipe_shader *shader,
133			    struct r600_shader_key key)
134{
135	static int dump_shaders = -1;
136	struct r600_context *rctx = (struct r600_context *)ctx;
137	struct r600_pipe_shader_selector *sel = shader->selector;
138	int r;
139
140	/* Would like some magic "get_bool_option_once" routine.
141	*/
142	if (dump_shaders == -1)
143		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
144
145	if (dump_shaders) {
146		fprintf(stderr, "--------------------------------------------------------------\n");
147		tgsi_dump(sel->tokens, 0);
148
149		if (sel->so.num_outputs) {
150			r600_dump_streamout(&sel->so);
151		}
152	}
153	r = r600_shader_from_tgsi(rctx->screen, shader, key);
154	if (r) {
155		R600_ERR("translation from TGSI failed !\n");
156		return r;
157	}
158	r = r600_bytecode_build(&shader->shader.bc);
159	if (r) {
160		R600_ERR("building bytecode failed !\n");
161		return r;
162	}
163	if (dump_shaders) {
164		r600_bytecode_dump(&shader->shader.bc);
165		fprintf(stderr, "______________________________________________________________\n");
166	}
167	return r600_pipe_shader(ctx, shader);
168}
169
170void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
171{
172	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
173	r600_bytecode_clear(&shader->shader.bc);
174}
175
176/*
177 * tgsi -> r600 shader
178 */
179struct r600_shader_tgsi_instruction;
180
/*
 * One source operand of a TGSI instruction, as filled in by tgsi_src().
 */
struct r600_shader_src {
	/* source selector: a register index plus file offset, or a special
	 * value such as V_SQ_ALU_SRC_LITERAL */
	unsigned				sel;
	/* per-channel swizzle, copied from the TGSI SwizzleX..SwizzleW fields */
	unsigned				swizzle[4];
	/* negate flag, from the TGSI Negate field */
	unsigned				neg;
	/* absolute-value flag, from the TGSI Absolute field */
	unsigned				abs;
	/* set to V_SQ_REL_RELATIVE for indirectly addressed registers */
	unsigned				rel;
	/* constant buffer index, taken from the TGSI dimension index */
	unsigned				kc_bank;
	/* the four literal dwords when sel is V_SQ_ALU_SRC_LITERAL */
	uint32_t				value[4];
};
190
/*
 * State carried through the translation of one shader into r600 bytecode.
 */
struct r600_shader_ctx {
	/* result of scanning the TGSI tokens (input counts, semantics, ...) */
	struct tgsi_shader_info			info;
	/* TGSI parser; parse.FullToken holds the token being processed */
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	/* processor type, compared against TGSI_PROCESSOR_* */
	unsigned				type;
	/* first GPR of each TGSI register file; added to register indices */
	unsigned				file_offset[TGSI_FILE_COUNT];
	/* first GPR available for driver temporaries (see r600_get_temp) */
	unsigned				temp_reg;
	/* entry of the per-chip opcode table for the current instruction */
	struct r600_shader_tgsi_instruction	*inst_info;
	/* bytecode being generated */
	struct r600_bytecode			*bc;
	/* shader description being filled in */
	struct r600_shader			*shader;
	/* presumably the decoded sources of the current instruction - TODO confirm */
	struct r600_shader_src			src[4];
	/* immediate values, four dwords per TGSI immediate register */
	uint32_t				*literals;
	/* presumably the number of dwords in literals - TODO confirm */
	uint32_t				nliterals;
	/* number of driver temporaries handed out by r600_get_temp so far */
	uint32_t				max_driver_temp_used;
	/* when set, evergreen_interp_input does not emit interpolation ALU code */
	boolean use_llvm;
	/* needed for evergreen interpolation */
	boolean                                 input_centroid;
	boolean                                 input_linear;
	boolean                                 input_perspective;
	/* GPR count computed by evergreen_gpr_count */
	int					num_interp_gpr;
	/* GPR of the TGSI_SEMANTIC_FACE fragment input */
	int					face_gpr;
	/* number of TGSI_SEMANTIC_COLOR fragment inputs declared */
	int					colors_used;
	/* TRUE once a TGSI_SEMANTIC_CLIPVERTEX output has been declared */
	boolean                 clip_vertex_write;
	/* index of that clip vertex output in shader->output[] */
	unsigned                cv_output;
	/* index of the TGSI_SEMANTIC_POSITION fragment input in shader->input[] */
	int					fragcoord_input;
	/* when zero, the instance ID is converted to float at declaration time */
	int					native_integers;
};
218
/*
 * One entry of the per-chip opcode tables (r600_/eg_/cm_shader_tgsi_instruction),
 * which are indexed by TGSI opcode.
 */
struct r600_shader_tgsi_instruction {
	/* TGSI opcode this entry describes */
	unsigned	tgsi_opcode;
	/* presumably non-zero for three-source (OP3) ALU encodings - TODO confirm */
	unsigned	is_op3;
	/* hardware opcode used for this TGSI opcode */
	unsigned	r600_opcode;
	/* handler for this opcode; takes the shader translation context */
	int (*process)(struct r600_shader_ctx *ctx);
};
225
226static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
227static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
228static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
229static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
230static int tgsi_else(struct r600_shader_ctx *ctx);
231static int tgsi_endif(struct r600_shader_ctx *ctx);
232static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
233static int tgsi_endloop(struct r600_shader_ctx *ctx);
234static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
235
236/*
237 * bytestream -> r600 shader
238 *
239 * These functions are used to transform the output of the LLVM backend into
240 * struct r600_bytecode.
241 */
242
243static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
244				unsigned char * bytes,	unsigned num_bytes);
245
246#ifdef HAVE_OPENCL
247int r600_compute_shader_create(struct pipe_context * ctx,
248	LLVMModuleRef mod,  struct r600_bytecode * bytecode)
249{
250	struct r600_context *r600_ctx = (struct r600_context *)ctx;
251	unsigned char * bytes;
252	unsigned byte_count;
253	struct r600_shader_ctx shader_ctx;
254	unsigned dump = 0;
255
256	if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
257		dump = 1;
258	}
259
260	r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
261	shader_ctx.bc = bytecode;
262	r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family,
263			   r600_ctx->screen->msaa_texture_support);
264	shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
265	r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
266	if (shader_ctx.bc->chip_class == CAYMAN) {
267		cm_bytecode_add_cf_end(shader_ctx.bc);
268	}
269	r600_bytecode_build(shader_ctx.bc);
270	if (dump) {
271		r600_bytecode_dump(shader_ctx.bc);
272	}
273	free(bytes);
274	return 1;
275}
276
277#endif /* HAVE_OPENCL */
278
/*
 * Read a little-endian 32-bit value from "bytes" starting at *bytes_read and
 * advance *bytes_read by four.
 *
 * Each byte is widened to uint32_t before shifting: an unsigned char promotes
 * to (signed) int, and shifting a value >= 0x80 left by 24 bits would
 * overflow int, which is undefined behaviour.
 */
static uint32_t i32_from_byte_stream(unsigned char * bytes,
		unsigned * bytes_read)
{
	unsigned i;
	uint32_t out = 0;
	for (i = 0; i < 4; i++) {
		out |= (uint32_t)bytes[(*bytes_read)++] << (8 * i);
	}
	return out;
}
289
290static unsigned r600_src_from_byte_stream(unsigned char * bytes,
291		unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
292{
293	unsigned i;
294	unsigned sel0, sel1;
295	sel0 = bytes[bytes_read++];
296	sel1 = bytes[bytes_read++];
297	alu->src[src_idx].sel = sel0 | (sel1 << 8);
298	alu->src[src_idx].chan = bytes[bytes_read++];
299	alu->src[src_idx].neg = bytes[bytes_read++];
300	alu->src[src_idx].abs = bytes[bytes_read++];
301	alu->src[src_idx].rel = bytes[bytes_read++];
302	alu->src[src_idx].kc_bank = bytes[bytes_read++];
303	for (i = 0; i < 4; i++) {
304		alu->src[src_idx].value |= bytes[bytes_read++] << (i * 8);
305	}
306	return bytes_read;
307}
308
309static unsigned r600_alu_from_byte_stream(struct r600_shader_ctx *ctx,
310				unsigned char * bytes, unsigned bytes_read)
311{
312	unsigned src_idx;
313	struct r600_bytecode_alu alu;
314	unsigned src_const_reg[3];
315	uint32_t word0, word1;
316
317	memset(&alu, 0, sizeof(alu));
318	for(src_idx = 0; src_idx < 3; src_idx++) {
319		unsigned i;
320		src_const_reg[src_idx] = bytes[bytes_read++];
321		for (i = 0; i < 4; i++) {
322			alu.src[src_idx].value |= bytes[bytes_read++] << (i * 8);
323		}
324	}
325
326	word0 = i32_from_byte_stream(bytes, &bytes_read);
327	word1 = i32_from_byte_stream(bytes, &bytes_read);
328
329	switch(ctx->bc->chip_class) {
330	default:
331	case R600:
332		r600_bytecode_alu_read(&alu, word0, word1);
333		break;
334	case R700:
335	case EVERGREEN:
336	case CAYMAN:
337		r700_bytecode_alu_read(&alu, word0, word1);
338		break;
339	}
340
341	for(src_idx = 0; src_idx < 3; src_idx++) {
342		if (src_const_reg[src_idx])
343			alu.src[src_idx].sel += 512;
344	}
345
346#if HAVE_LLVM < 0x0302
347	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE) ||
348	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE) ||
349	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT) ||
350	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT)) {
351		alu.update_pred = 1;
352		alu.dst.write = 0;
353		alu.src[1].sel = V_SQ_ALU_SRC_0;
354		alu.src[1].chan = 0;
355		alu.last = 1;
356	}
357#endif
358
359	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT)) {
360		ctx->bc->ar_reg = alu.src[0].sel;
361		ctx->bc->ar_loaded = 0;
362		return bytes_read;
363	}
364
365	if (alu.execute_mask) {
366		alu.pred_sel = 0;
367		r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
368	} else {
369		r600_bytecode_add_alu(ctx->bc, &alu);
370	}
371
372	/* XXX: Handle other KILL instructions */
373	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT)) {
374		ctx->shader->uses_kill = 1;
375		/* XXX: This should be enforced in the LLVM backend. */
376		ctx->bc->force_add_cf = 1;
377	}
378	return bytes_read;
379}
380
381static void llvm_if(struct r600_shader_ctx *ctx)
382{
383	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
384	fc_pushlevel(ctx, FC_IF);
385	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
386}
387
388static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx)
389{
390	unsigned opcode = TGSI_OPCODE_BRK;
391	if (ctx->bc->chip_class == CAYMAN)
392		ctx->inst_info = &cm_shader_tgsi_instruction[opcode];
393	else if (ctx->bc->chip_class >= EVERGREEN)
394		ctx->inst_info = &eg_shader_tgsi_instruction[opcode];
395	else
396		ctx->inst_info = &r600_shader_tgsi_instruction[opcode];
397	llvm_if(ctx);
398	tgsi_loop_brk_cont(ctx);
399	tgsi_endif(ctx);
400}
401
402static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
403				unsigned char * bytes, unsigned bytes_read)
404{
405	struct r600_bytecode_alu alu;
406	unsigned inst;
407	memset(&alu, 0, sizeof(alu));
408	bytes_read = r600_src_from_byte_stream(bytes, bytes_read, &alu, 0);
409	inst = bytes[bytes_read++];
410	switch (inst) {
411	case 0: /* IF_PREDICATED */
412		llvm_if(ctx);
413		break;
414	case 1: /* ELSE */
415		tgsi_else(ctx);
416		break;
417	case 2: /* ENDIF */
418		tgsi_endif(ctx);
419		break;
420	case 3: /* BGNLOOP */
421		tgsi_bgnloop(ctx);
422		break;
423	case 4: /* ENDLOOP */
424		tgsi_endloop(ctx);
425		break;
426	case 5: /* PREDICATED_BREAK */
427		r600_break_from_byte_stream(ctx);
428		break;
429	case 6: /* CONTINUE */
430		{
431			unsigned opcode = TGSI_OPCODE_CONT;
432			if (ctx->bc->chip_class == CAYMAN) {
433				ctx->inst_info =
434					&cm_shader_tgsi_instruction[opcode];
435			} else if (ctx->bc->chip_class >= EVERGREEN) {
436				ctx->inst_info =
437					&eg_shader_tgsi_instruction[opcode];
438			} else {
439				ctx->inst_info =
440					&r600_shader_tgsi_instruction[opcode];
441			}
442			tgsi_loop_brk_cont(ctx);
443		}
444		break;
445	}
446
447	return bytes_read;
448}
449
450static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
451				unsigned char * bytes, unsigned bytes_read)
452{
453	struct r600_bytecode_tex tex;
454
455	tex.inst = bytes[bytes_read++];
456	tex.resource_id = bytes[bytes_read++];
457	tex.src_gpr = bytes[bytes_read++];
458	tex.src_rel = bytes[bytes_read++];
459	tex.dst_gpr = bytes[bytes_read++];
460	tex.dst_rel = bytes[bytes_read++];
461	tex.dst_sel_x = bytes[bytes_read++];
462	tex.dst_sel_y = bytes[bytes_read++];
463	tex.dst_sel_z = bytes[bytes_read++];
464	tex.dst_sel_w = bytes[bytes_read++];
465	tex.lod_bias = bytes[bytes_read++];
466	tex.coord_type_x = bytes[bytes_read++];
467	tex.coord_type_y = bytes[bytes_read++];
468	tex.coord_type_z = bytes[bytes_read++];
469	tex.coord_type_w = bytes[bytes_read++];
470	tex.offset_x = bytes[bytes_read++];
471	tex.offset_y = bytes[bytes_read++];
472	tex.offset_z = bytes[bytes_read++];
473	tex.sampler_id = bytes[bytes_read++];
474	tex.src_sel_x = bytes[bytes_read++];
475	tex.src_sel_y = bytes[bytes_read++];
476	tex.src_sel_z = bytes[bytes_read++];
477	tex.src_sel_w = bytes[bytes_read++];
478
479	tex.inst_mod = 0;
480
481	r600_bytecode_add_tex(ctx->bc, &tex);
482
483	return bytes_read;
484}
485
486static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
487	unsigned char * bytes, unsigned bytes_read)
488{
489	struct r600_bytecode_vtx vtx;
490
491	uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
492        uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
493	uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);
494
495	memset(&vtx, 0, sizeof(vtx));
496
497	/* WORD0 */
498	vtx.inst = G_SQ_VTX_WORD0_VTX_INST(word0);
499	vtx.fetch_type = G_SQ_VTX_WORD0_FETCH_TYPE(word0);
500	vtx.buffer_id = G_SQ_VTX_WORD0_BUFFER_ID(word0);
501	vtx.src_gpr = G_SQ_VTX_WORD0_SRC_GPR(word0);
502	vtx.src_sel_x = G_SQ_VTX_WORD0_SRC_SEL_X(word0);
503	vtx.mega_fetch_count = G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(word0);
504
505	/* WORD1 */
506	vtx.dst_gpr = G_SQ_VTX_WORD1_GPR_DST_GPR(word1);
507	vtx.dst_sel_x = G_SQ_VTX_WORD1_DST_SEL_X(word1);
508	vtx.dst_sel_y = G_SQ_VTX_WORD1_DST_SEL_Y(word1);
509	vtx.dst_sel_z = G_SQ_VTX_WORD1_DST_SEL_Z(word1);
510	vtx.dst_sel_w = G_SQ_VTX_WORD1_DST_SEL_W(word1);
511	vtx.use_const_fields = G_SQ_VTX_WORD1_USE_CONST_FIELDS(word1);
512	vtx.data_format = G_SQ_VTX_WORD1_DATA_FORMAT(word1);
513	vtx.num_format_all = G_SQ_VTX_WORD1_NUM_FORMAT_ALL(word1);
514	vtx.format_comp_all = G_SQ_VTX_WORD1_FORMAT_COMP_ALL(word1);
515	vtx.srf_mode_all = G_SQ_VTX_WORD1_SRF_MODE_ALL(word1);
516
517	/* WORD 2*/
518	vtx.offset = G_SQ_VTX_WORD2_OFFSET(word2);
519	vtx.endian = G_SQ_VTX_WORD2_ENDIAN_SWAP(word2);
520
521	if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
522		fprintf(stderr, "Error adding vtx\n");
523	}
524	/* Use the Texture Cache */
525	ctx->bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX;
526	return bytes_read;
527}
528
529static int r600_export_from_byte_stream(struct r600_shader_ctx *ctx,
530	unsigned char * bytes, unsigned bytes_read)
531{
532	uint32_t word0 = 0, word1 = 0;
533	struct r600_bytecode_output output;
534	memset(&output, 0, sizeof(struct r600_bytecode_output));
535	word0 = i32_from_byte_stream(bytes, &bytes_read);
536	word1 = i32_from_byte_stream(bytes, &bytes_read);
537	if (ctx->bc->chip_class >= EVERGREEN)
538		eg_bytecode_export_read(&output, word0,word1);
539	else
540		r600_bytecode_export_read(&output, word0,word1);
541	r600_bytecode_add_output(ctx->bc, &output);
542	return bytes_read;
543}
544
545static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
546				unsigned char * bytes,	unsigned num_bytes)
547{
548	unsigned bytes_read = 0;
549	unsigned i, byte;
550	while (bytes_read < num_bytes) {
551		char inst_type = bytes[bytes_read++];
552		switch (inst_type) {
553		case 0:
554			bytes_read = r600_alu_from_byte_stream(ctx, bytes,
555								bytes_read);
556			break;
557		case 1:
558			bytes_read = r600_tex_from_byte_stream(ctx, bytes,
559								bytes_read);
560			break;
561		case 2:
562			bytes_read = r600_fc_from_byte_stream(ctx, bytes,
563								bytes_read);
564			break;
565		case 3:
566			r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
567			for (i = 0; i < 2; i++) {
568				for (byte = 0 ; byte < 4; byte++) {
569					ctx->bc->cf_last->isa[i] |=
570					(bytes[bytes_read++] << (byte * 8));
571				}
572			}
573			break;
574
575		case 4:
576			bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
577								bytes_read);
578			break;
579		case 5:
580            bytes_read = r600_export_from_byte_stream(ctx, bytes,
581                                bytes_read);
582            break;
583		default:
584			/* XXX: Error here */
585			break;
586		}
587	}
588}
589
590/* End bytestream -> r600 shader functions*/
591
592static int tgsi_is_supported(struct r600_shader_ctx *ctx)
593{
594	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
595	int j;
596
597	if (i->Instruction.NumDstRegs > 1) {
598		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
599		return -EINVAL;
600	}
601	if (i->Instruction.Predicate) {
602		R600_ERR("predicate unsupported\n");
603		return -EINVAL;
604	}
605#if 0
606	if (i->Instruction.Label) {
607		R600_ERR("label unsupported\n");
608		return -EINVAL;
609	}
610#endif
611	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
612		if (i->Src[j].Register.Dimension) {
613		   if (i->Src[j].Register.File != TGSI_FILE_CONSTANT) {
614			   R600_ERR("unsupported src %d (dimension %d)\n", j,
615				    i->Src[j].Register.Dimension);
616			   return -EINVAL;
617		   }
618		}
619	}
620	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
621		if (i->Dst[j].Register.Dimension) {
622			R600_ERR("unsupported dst (dimension)\n");
623			return -EINVAL;
624		}
625	}
626	return 0;
627}
628
629static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
630{
631	int i, r;
632	struct r600_bytecode_alu alu;
633	int gpr = 0, base_chan = 0;
634	int ij_index = 0;
635
636	if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
637		ij_index = 0;
638		if (ctx->shader->input[input].centroid)
639			ij_index++;
640	} else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
641		ij_index = 0;
642		/* if we have perspective add one */
643		if (ctx->input_perspective)  {
644			ij_index++;
645			/* if we have perspective centroid */
646			if (ctx->input_centroid)
647				ij_index++;
648		}
649		if (ctx->shader->input[input].centroid)
650			ij_index++;
651	}
652
653	/* work out gpr and base_chan from index */
654	gpr = ij_index / 2;
655	base_chan = (2 * (ij_index % 2)) + 1;
656
657	for (i = 0; i < 8; i++) {
658		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
659
660		if (i < 4)
661			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_ZW;
662		else
663			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_XY;
664
665		if ((i > 1) && (i < 6)) {
666			alu.dst.sel = ctx->shader->input[input].gpr;
667			alu.dst.write = 1;
668		}
669
670		alu.dst.chan = i % 4;
671
672		alu.src[0].sel = gpr;
673		alu.src[0].chan = (base_chan - (i % 2));
674
675		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
676
677		alu.bank_swizzle_force = SQ_ALU_VEC_210;
678		if ((i % 4) == 3)
679			alu.last = 1;
680		r = r600_bytecode_add_alu(ctx->bc, &alu);
681		if (r)
682			return r;
683	}
684	return 0;
685}
686
687static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
688{
689	int i, r;
690	struct r600_bytecode_alu alu;
691
692	for (i = 0; i < 4; i++) {
693		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
694
695		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P0;
696
697		alu.dst.sel = ctx->shader->input[input].gpr;
698		alu.dst.write = 1;
699
700		alu.dst.chan = i;
701
702		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
703		alu.src[0].chan = i;
704
705		if (i == 3)
706			alu.last = 1;
707		r = r600_bytecode_add_alu(ctx->bc, &alu);
708		if (r)
709			return r;
710	}
711	return 0;
712}
713
714/*
715 * Special export handling in shaders
716 *
717 * shader export ARRAY_BASE for EXPORT_POS:
718 * 60 is position
719 * 61 is misc vector
720 * 62, 63 are clip distance vectors
721 *
 * The use of the values exported in 61-63 is controlled by PA_CL_VS_OUT_CNTL:
723 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
724 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
725 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
726 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
727 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
728 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
729 * exclusive from render target index)
730 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
731 *
732 *
733 * shader export ARRAY_BASE for EXPORT_PIXEL:
734 * 0-7 CB targets
735 * 61 computed Z vector
736 *
 * The use of the values exported in the computed Z vector is controlled
 * by DB_SHADER_CONTROL:
739 * Z_EXPORT_ENABLE - Z as a float in RED
740 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
741 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
742 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
743 * DB_SOURCE_FORMAT - export control restrictions
744 *
745 */
746
747
748/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
749static int r600_spi_sid(struct r600_shader_io * io)
750{
751	int index, name = io->name;
752
753	/* These params are handled differently, they don't need
754	 * semantic indices, so we'll use 0 for them.
755	 */
756	if (name == TGSI_SEMANTIC_POSITION ||
757		name == TGSI_SEMANTIC_PSIZE ||
758		name == TGSI_SEMANTIC_FACE)
759		index = 0;
760	else {
761		if (name == TGSI_SEMANTIC_GENERIC) {
762			/* For generic params simply use sid from tgsi */
763			index = io->sid;
764		} else {
765			/* For non-generic params - pack name and sid into 8 bits */
766			index = 0x80 | (name<<3) | (io->sid);
767		}
768
769		/* Make sure that all really used indices have nonzero value, so
770		 * we can just compare it to 0 later instead of comparing the name
771		 * with different values to detect special cases. */
772		index++;
773	}
774
775	return index;
776};
777
778/* turn input into interpolate on EG */
779static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
780{
781	int r = 0;
782
783	if (ctx->shader->input[index].spi_sid) {
784		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
785		if (!ctx->use_llvm) {
786			if (ctx->shader->input[index].interpolate > 0) {
787				r = evergreen_interp_alu(ctx, index);
788			} else {
789				r = evergreen_interp_flat(ctx, index);
790			}
791		}
792	}
793	return r;
794}
795
796static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
797{
798	struct r600_bytecode_alu alu;
799	int i, r;
800	int gpr_front = ctx->shader->input[front].gpr;
801	int gpr_back = ctx->shader->input[back].gpr;
802
803	for (i = 0; i < 4; i++) {
804		memset(&alu, 0, sizeof(alu));
805		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
806		alu.is_op3 = 1;
807		alu.dst.write = 1;
808		alu.dst.sel = gpr_front;
809		alu.src[0].sel = ctx->face_gpr;
810		alu.src[1].sel = gpr_front;
811		alu.src[2].sel = gpr_back;
812
813		alu.dst.chan = i;
814		alu.src[1].chan = i;
815		alu.src[2].chan = i;
816		alu.last = (i==3);
817
818		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
819			return r;
820	}
821
822	return 0;
823}
824
/*
 * Process the TGSI declaration currently held by the parser.
 *
 * Inputs and outputs are appended to shader->input[] / shader->output[] and
 * bookkeeping in ctx is updated from their semantics. Constant, temporary,
 * sampler and address declarations need no work. Of the system values only
 * INSTANCEID and VERTEXID are accepted.
 *
 * Returns 0 on success, -EINVAL for an unsupported declaration, or the
 * error of a helper that emitted code.
 */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	unsigned i;
	int r;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		i = ctx->shader->ninput++;
		ctx->shader->input[i].name = d->Semantic.Name;
		ctx->shader->input[i].sid = d->Semantic.Index;
		ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
		ctx->shader->input[i].centroid = d->Interp.Centroid;
		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			/* remember the inputs that need special handling */
			switch (ctx->shader->input[i].name) {
			case TGSI_SEMANTIC_FACE:
				ctx->face_gpr = ctx->shader->input[i].gpr;
				break;
			case TGSI_SEMANTIC_COLOR:
				ctx->colors_used++;
				break;
			case TGSI_SEMANTIC_POSITION:
				ctx->fragcoord_input = i;
				break;
			}
			/* on Evergreen and later the input is set up for interpolation here */
			if (ctx->bc->chip_class >= EVERGREEN) {
				if ((r = evergreen_interp_input(ctx, i)))
					return r;
			}
		}
		break;
	case TGSI_FILE_OUTPUT:
		i = ctx->shader->noutput++;
		ctx->shader->output[i].name = d->Semantic.Name;
		ctx->shader->output[i].sid = d->Semantic.Index;
		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_CLIPDIST:
				/* four mask bits per clip distance vector */
				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
				break;
			case TGSI_SEMANTIC_PSIZE:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_point_size = 1;
				break;
			case TGSI_SEMANTIC_CLIPVERTEX:
				ctx->clip_vertex_write = TRUE;
				ctx->cv_output = i;
				break;
			}
		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_COLOR:
				ctx->shader->nr_ps_max_color_exports++;
				break;
			}
		}
		break;
	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_TEMPORARY:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			if (!ctx->native_integers) {
				/* without native integers, convert GPR0.w
				 * (where tgsi_src reads the instance ID)
				 * to float in place */
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		/* any other system value falls through and is rejected below */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
921
922static int r600_get_temp(struct r600_shader_ctx *ctx)
923{
924	return ctx->temp_reg + ctx->max_driver_temp_used++;
925}
926
927/*
928 * for evergreen we need to scan the shader to find the number of GPRs we need to
929 * reserve for interpolation.
930 *
931 * we need to know if we are going to emit
932 * any centroid inputs
933 * if perspective and linear are required
934*/
935static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
936{
937	int i;
938	int num_baryc;
939
940	ctx->input_linear = FALSE;
941	ctx->input_perspective = FALSE;
942	ctx->input_centroid = FALSE;
943	ctx->num_interp_gpr = 1;
944
945	/* any centroid inputs */
946	for (i = 0; i < ctx->info.num_inputs; i++) {
947		/* skip position/face */
948		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
949		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
950			continue;
951		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
952			ctx->input_linear = TRUE;
953		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
954			ctx->input_perspective = TRUE;
955		if (ctx->info.input_centroid[i])
956			ctx->input_centroid = TRUE;
957	}
958
959	num_baryc = 0;
960	/* ignoring sample for now */
961	if (ctx->input_perspective)
962		num_baryc++;
963	if (ctx->input_linear)
964		num_baryc++;
965	if (ctx->input_centroid)
966		num_baryc *= 2;
967
968	ctx->num_interp_gpr += (num_baryc + 1) >> 1;
969
970	/* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */
971	return ctx->num_interp_gpr;
972}
973
974static void tgsi_src(struct r600_shader_ctx *ctx,
975		     const struct tgsi_full_src_register *tgsi_src,
976		     struct r600_shader_src *r600_src)
977{
978	memset(r600_src, 0, sizeof(*r600_src));
979	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
980	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
981	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
982	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
983	r600_src->neg = tgsi_src->Register.Negate;
984	r600_src->abs = tgsi_src->Register.Absolute;
985
986	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
987		int index;
988		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
989			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
990			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
991
992			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
993			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
994			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
995				return;
996		}
997		index = tgsi_src->Register.Index;
998		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
999		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
1000	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
1001		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
1002			r600_src->swizzle[0] = 3;
1003			r600_src->swizzle[1] = 3;
1004			r600_src->swizzle[2] = 3;
1005			r600_src->swizzle[3] = 3;
1006			r600_src->sel = 0;
1007		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
1008			r600_src->swizzle[0] = 0;
1009			r600_src->swizzle[1] = 0;
1010			r600_src->swizzle[2] = 0;
1011			r600_src->swizzle[3] = 0;
1012			r600_src->sel = 0;
1013		}
1014	} else {
1015		if (tgsi_src->Register.Indirect)
1016			r600_src->rel = V_SQ_REL_RELATIVE;
1017		r600_src->sel = tgsi_src->Register.Index;
1018		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1019	}
1020	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1021		if (tgsi_src->Register.Dimension) {
1022			r600_src->kc_bank = tgsi_src->Dimension.Index;
1023		}
1024	}
1025}
1026
1027static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int cb_idx, unsigned int offset, unsigned int dst_reg)
1028{
1029	struct r600_bytecode_vtx vtx;
1030	unsigned int ar_reg;
1031	int r;
1032
1033	if (offset) {
1034		struct r600_bytecode_alu alu;
1035
1036		memset(&alu, 0, sizeof(alu));
1037
1038		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
1039		alu.src[0].sel = ctx->bc->ar_reg;
1040
1041		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1042		alu.src[1].value = offset;
1043
1044		alu.dst.sel = dst_reg;
1045		alu.dst.write = 1;
1046		alu.last = 1;
1047
1048		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1049			return r;
1050
1051		ar_reg = dst_reg;
1052	} else {
1053		ar_reg = ctx->bc->ar_reg;
1054	}
1055
1056	memset(&vtx, 0, sizeof(vtx));
1057	vtx.buffer_id = cb_idx;
1058	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
1059	vtx.src_gpr = ar_reg;
1060	vtx.mega_fetch_count = 16;
1061	vtx.dst_gpr = dst_reg;
1062	vtx.dst_sel_x = 0;		/* SEL_X */
1063	vtx.dst_sel_y = 1;		/* SEL_Y */
1064	vtx.dst_sel_z = 2;		/* SEL_Z */
1065	vtx.dst_sel_w = 3;		/* SEL_W */
1066	vtx.data_format = FMT_32_32_32_32_FLOAT;
1067	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1068	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1069	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
1070	vtx.endian = r600_endian_swap(32);
1071
1072	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1073		return r;
1074
1075	return 0;
1076}
1077
1078static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1079{
1080	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1081	struct r600_bytecode_alu alu;
1082	int i, j, k, nconst, r;
1083
1084	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1085		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1086			nconst++;
1087		}
1088		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1089	}
1090	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1091		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1092			continue;
1093		}
1094
1095		if (ctx->src[i].rel) {
1096			int treg = r600_get_temp(ctx);
1097			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].sel - 512, treg)))
1098				return r;
1099
1100			ctx->src[i].kc_bank = 0;
1101			ctx->src[i].sel = treg;
1102			ctx->src[i].rel = 0;
1103			j--;
1104		} else if (j > 0) {
1105			int treg = r600_get_temp(ctx);
1106			for (k = 0; k < 4; k++) {
1107				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1108				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1109				alu.src[0].sel = ctx->src[i].sel;
1110				alu.src[0].chan = k;
1111				alu.src[0].rel = ctx->src[i].rel;
1112				alu.dst.sel = treg;
1113				alu.dst.chan = k;
1114				alu.dst.write = 1;
1115				if (k == 3)
1116					alu.last = 1;
1117				r = r600_bytecode_add_alu(ctx->bc, &alu);
1118				if (r)
1119					return r;
1120			}
1121			ctx->src[i].sel = treg;
1122			ctx->src[i].rel =0;
1123			j--;
1124		}
1125	}
1126	return 0;
1127}
1128
1129/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1130static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1131{
1132	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1133	struct r600_bytecode_alu alu;
1134	int i, j, k, nliteral, r;
1135
1136	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1137		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1138			nliteral++;
1139		}
1140	}
1141	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1142		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1143			int treg = r600_get_temp(ctx);
1144			for (k = 0; k < 4; k++) {
1145				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1146				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1147				alu.src[0].sel = ctx->src[i].sel;
1148				alu.src[0].chan = k;
1149				alu.src[0].value = ctx->src[i].value[k];
1150				alu.dst.sel = treg;
1151				alu.dst.chan = k;
1152				alu.dst.write = 1;
1153				if (k == 3)
1154					alu.last = 1;
1155				r = r600_bytecode_add_alu(ctx->bc, &alu);
1156				if (r)
1157					return r;
1158			}
1159			ctx->src[i].sel = treg;
1160			j--;
1161		}
1162	}
1163	return 0;
1164}
1165
1166static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1167{
1168	int i, r, count = ctx->shader->ninput;
1169
1170	for (i = 0; i < count; i++) {
1171		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1172			unsigned back_facing_reg = ctx->shader->input[i].potential_back_facing_reg;
1173			if (ctx->bc->chip_class >= EVERGREEN) {
1174				if ((r = evergreen_interp_input(ctx, back_facing_reg)))
1175					return r;
1176			}
1177
1178			if (!ctx->use_llvm) {
1179				r = select_twoside_color(ctx, i, back_facing_reg);
1180				if (r)
1181					return r;
1182			}
1183		}
1184	}
1185	return 0;
1186}
1187
1188static int r600_shader_from_tgsi(struct r600_screen *rscreen,
1189				 struct r600_pipe_shader *pipeshader,
1190				 struct r600_shader_key key)
1191{
1192	struct r600_shader *shader = &pipeshader->shader;
1193	struct tgsi_token *tokens = pipeshader->selector->tokens;
1194	struct pipe_stream_output_info so = pipeshader->selector->so;
1195	struct tgsi_full_immediate *immediate;
1196	struct tgsi_full_property *property;
1197	struct r600_shader_ctx ctx;
1198	struct r600_bytecode_output output[32];
1199	unsigned output_done, noutput;
1200	unsigned opcode;
1201	int i, j, k, r = 0;
1202	int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0;
1203	/* Declarations used by llvm code */
1204	bool use_llvm = false;
1205	unsigned char * inst_bytes = NULL;
1206	unsigned inst_byte_count = 0;
1207
1208#ifdef R600_USE_LLVM
1209	use_llvm = debug_get_bool_option("R600_LLVM", TRUE);
1210#endif
1211	ctx.bc = &shader->bc;
1212	ctx.shader = shader;
1213	ctx.native_integers = true;
1214
1215	r600_bytecode_init(ctx.bc, rscreen->chip_class, rscreen->family,
1216			   rscreen->msaa_texture_support);
1217	ctx.tokens = tokens;
1218	tgsi_scan_shader(tokens, &ctx.info);
1219	tgsi_parse_init(&ctx.parse, tokens);
1220	ctx.type = ctx.parse.FullHeader.Processor.Processor;
1221	shader->processor_type = ctx.type;
1222	ctx.bc->type = shader->processor_type;
1223
1224	ctx.face_gpr = -1;
1225	ctx.fragcoord_input = -1;
1226	ctx.colors_used = 0;
1227	ctx.clip_vertex_write = 0;
1228
1229	shader->nr_ps_color_exports = 0;
1230	shader->nr_ps_max_color_exports = 0;
1231
1232	shader->two_side = key.color_two_side;
1233
1234	/* register allocations */
1235	/* Values [0,127] correspond to GPR[0..127].
1236	 * Values [128,159] correspond to constant buffer bank 0
1237	 * Values [160,191] correspond to constant buffer bank 1
1238	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1239	 * Values [256,287] correspond to constant buffer bank 2 (EG)
1240	 * Values [288,319] correspond to constant buffer bank 3 (EG)
1241	 * Other special values are shown in the list below.
1242	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1243	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1244	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1245	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1246	 * 248	SQ_ALU_SRC_0: special constant 0.0.
1247	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
1248	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
1249	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1250	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
1251	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
1252	 * 254	SQ_ALU_SRC_PV: previous vector result.
1253	 * 255	SQ_ALU_SRC_PS: previous scalar result.
1254	 */
1255	for (i = 0; i < TGSI_FILE_COUNT; i++) {
1256		ctx.file_offset[i] = 0;
1257	}
1258	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1259		ctx.file_offset[TGSI_FILE_INPUT] = 1;
1260		if (ctx.bc->chip_class >= EVERGREEN) {
1261			r600_bytecode_add_cfinst(ctx.bc, EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1262		} else {
1263			r600_bytecode_add_cfinst(ctx.bc, V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1264		}
1265	}
1266	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
1267		ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1268	}
1269
1270#ifdef R600_USE_LLVM
1271	if (use_llvm && ctx.info.indirect_files) {
1272		fprintf(stderr, "Warning: R600 LLVM backend does not support "
1273				"indirect adressing.  Falling back to TGSI "
1274				"backend.\n");
1275		use_llvm = 0;
1276	}
1277#endif
1278	ctx.use_llvm = use_llvm;
1279
1280	if (use_llvm) {
1281		ctx.file_offset[TGSI_FILE_OUTPUT] =
1282			ctx.file_offset[TGSI_FILE_INPUT];
1283	} else {
1284	   ctx.file_offset[TGSI_FILE_OUTPUT] =
1285			ctx.file_offset[TGSI_FILE_INPUT] +
1286			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1287	}
1288	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
1289						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
1290
1291	/* Outside the GPR range. This will be translated to one of the
1292	 * kcache banks later. */
1293	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
1294
1295	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
1296	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
1297			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
1298	ctx.temp_reg = ctx.bc->ar_reg + 1;
1299
1300	ctx.nliterals = 0;
1301	ctx.literals = NULL;
1302	shader->fs_write_all = FALSE;
1303	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1304		tgsi_parse_token(&ctx.parse);
1305		switch (ctx.parse.FullToken.Token.Type) {
1306		case TGSI_TOKEN_TYPE_IMMEDIATE:
1307			immediate = &ctx.parse.FullToken.FullImmediate;
1308			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
1309			if(ctx.literals == NULL) {
1310				r = -ENOMEM;
1311				goto out_err;
1312			}
1313			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
1314			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
1315			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
1316			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
1317			ctx.nliterals++;
1318			break;
1319		case TGSI_TOKEN_TYPE_DECLARATION:
1320			r = tgsi_declaration(&ctx);
1321			if (r)
1322				goto out_err;
1323			break;
1324		case TGSI_TOKEN_TYPE_INSTRUCTION:
1325			break;
1326		case TGSI_TOKEN_TYPE_PROPERTY:
1327			property = &ctx.parse.FullToken.FullProperty;
1328			switch (property->Property.PropertyName) {
1329			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
1330				if (property->u[0].Data == 1)
1331					shader->fs_write_all = TRUE;
1332				break;
1333			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
1334				/* we don't need this one */
1335				break;
1336			}
1337			break;
1338		default:
1339			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
1340			r = -EINVAL;
1341			goto out_err;
1342		}
1343	}
1344
1345	/* Process two side if needed */
1346	if (shader->two_side && ctx.colors_used) {
1347		int i, count = ctx.shader->ninput;
1348		unsigned next_lds_loc = ctx.shader->nlds;
1349
1350		/* additional inputs will be allocated right after the existing inputs,
1351		 * we won't need them after the color selection, so we don't need to
1352		 * reserve these gprs for the rest of the shader code and to adjust
1353		 * output offsets etc. */
1354		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
1355				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1356
1357		if (ctx.face_gpr == -1) {
1358			i = ctx.shader->ninput++;
1359			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
1360			ctx.shader->input[i].spi_sid = 0;
1361			ctx.shader->input[i].gpr = gpr++;
1362			ctx.face_gpr = ctx.shader->input[i].gpr;
1363		}
1364
1365		for (i = 0; i < count; i++) {
1366			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1367				int ni = ctx.shader->ninput++;
1368				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
1369				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
1370				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
1371				ctx.shader->input[ni].gpr = gpr++;
1372				// TGSI to LLVM needs to know the lds position of inputs.
1373				// Non LLVM path computes it later (in process_twoside_color)
1374				ctx.shader->input[ni].lds_pos = next_lds_loc++;
1375				ctx.shader->input[i].potential_back_facing_reg = ni;
1376			}
1377		}
1378	}
1379
1380/* LLVM backend setup */
1381#ifdef R600_USE_LLVM
1382	if (use_llvm) {
1383		struct radeon_llvm_context radeon_llvm_ctx;
1384		LLVMModuleRef mod;
1385		unsigned dump = 0;
1386		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
1387		radeon_llvm_ctx.reserved_reg_count = ctx.file_offset[TGSI_FILE_INPUT];
1388		radeon_llvm_ctx.type = ctx.type;
1389		radeon_llvm_ctx.two_side = shader->two_side;
1390		radeon_llvm_ctx.face_input = ctx.face_gpr;
1391		radeon_llvm_ctx.r600_inputs = ctx.shader->input;
1392		radeon_llvm_ctx.r600_outputs = ctx.shader->output;
1393		radeon_llvm_ctx.color_buffer_count = MAX2(key.nr_cbufs , 1);
1394		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
1395		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->chip_class >= EVERGREEN);
1396		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
1397		if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
1398			dump = 1;
1399		}
1400		if (r600_llvm_compile(mod, &inst_bytes, &inst_byte_count,
1401							rscreen->family, dump)) {
1402			FREE(inst_bytes);
1403			radeon_llvm_dispose(&radeon_llvm_ctx);
1404			use_llvm = 0;
1405			fprintf(stderr, "R600 LLVM backend failed to compile "
1406				"shader.  Falling back to TGSI\n");
1407		} else {
1408			ctx.file_offset[TGSI_FILE_OUTPUT] =
1409					ctx.file_offset[TGSI_FILE_INPUT];
1410		}
1411		radeon_llvm_dispose(&radeon_llvm_ctx);
1412	}
1413#endif
1414/* End of LLVM backend setup */
1415
1416	if (shader->fs_write_all && rscreen->chip_class >= EVERGREEN)
1417		shader->nr_ps_max_color_exports = 8;
1418
1419	if (ctx.fragcoord_input >= 0 && !use_llvm) {
1420		if (ctx.bc->chip_class == CAYMAN) {
1421			for (j = 0 ; j < 4; j++) {
1422				struct r600_bytecode_alu alu;
1423				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1424				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1425				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1426				alu.src[0].chan = 3;
1427
1428				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1429				alu.dst.chan = j;
1430				alu.dst.write = (j == 3);
1431				alu.last = 1;
1432				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1433					return r;
1434			}
1435		} else {
1436			struct r600_bytecode_alu alu;
1437			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1438			alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1439			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1440			alu.src[0].chan = 3;
1441
1442			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1443			alu.dst.chan = 3;
1444			alu.dst.write = 1;
1445			alu.last = 1;
1446			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1447				return r;
1448		}
1449	}
1450
1451	if (shader->two_side && ctx.colors_used) {
1452		if ((r = process_twoside_color_inputs(&ctx)))
1453			return r;
1454	}
1455
1456	tgsi_parse_init(&ctx.parse, tokens);
1457	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1458		tgsi_parse_token(&ctx.parse);
1459		switch (ctx.parse.FullToken.Token.Type) {
1460		case TGSI_TOKEN_TYPE_INSTRUCTION:
1461			if (use_llvm) {
1462				continue;
1463			}
1464			r = tgsi_is_supported(&ctx);
1465			if (r)
1466				goto out_err;
1467			ctx.max_driver_temp_used = 0;
1468			/* reserve first tmp for everyone */
1469			r600_get_temp(&ctx);
1470
1471			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
1472			if ((r = tgsi_split_constant(&ctx)))
1473				goto out_err;
1474			if ((r = tgsi_split_literal_constant(&ctx)))
1475				goto out_err;
1476			if (ctx.bc->chip_class == CAYMAN)
1477				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
1478			else if (ctx.bc->chip_class >= EVERGREEN)
1479				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
1480			else
1481				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
1482			r = ctx.inst_info->process(&ctx);
1483			if (r)
1484				goto out_err;
1485			break;
1486		default:
1487			break;
1488		}
1489	}
1490
1491	/* Reset the temporary register counter. */
1492	ctx.max_driver_temp_used = 0;
1493
1494	/* Get instructions if we are using the LLVM backend. */
1495	if (use_llvm) {
1496		r600_bytecode_from_byte_stream(&ctx, inst_bytes, inst_byte_count);
1497		FREE(inst_bytes);
1498	}
1499
1500	noutput = shader->noutput;
1501
1502	if (ctx.clip_vertex_write) {
1503		unsigned clipdist_temp[2];
1504
1505		clipdist_temp[0] = r600_get_temp(&ctx);
1506		clipdist_temp[1] = r600_get_temp(&ctx);
1507
1508		/* need to convert a clipvertex write into clipdistance writes and not export
1509		   the clip vertex anymore */
1510
1511		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
1512		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1513		shader->output[noutput].gpr = clipdist_temp[0];
1514		noutput++;
1515		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1516		shader->output[noutput].gpr = clipdist_temp[1];
1517		noutput++;
1518
1519		/* reset spi_sid for clipvertex output to avoid confusing spi */
1520		shader->output[ctx.cv_output].spi_sid = 0;
1521
1522		shader->clip_dist_write = 0xFF;
1523
1524		for (i = 0; i < 8; i++) {
1525			int oreg = i >> 2;
1526			int ochan = i & 3;
1527
1528			for (j = 0; j < 4; j++) {
1529				struct r600_bytecode_alu alu;
1530				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1531				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4);
1532				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
1533				alu.src[0].chan = j;
1534
1535				alu.src[1].sel = 512 + i;
1536				alu.src[1].kc_bank = R600_UCP_CONST_BUFFER;
1537				alu.src[1].chan = j;
1538
1539				alu.dst.sel = clipdist_temp[oreg];
1540				alu.dst.chan = j;
1541				alu.dst.write = (j == ochan);
1542				if (j == 3)
1543					alu.last = 1;
1544				r = r600_bytecode_add_alu(ctx.bc, &alu);
1545				if (r)
1546					return r;
1547			}
1548		}
1549	}
1550
1551	/* Add stream outputs. */
1552	if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
1553		unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
1554
1555		/* Sanity checking. */
1556		if (so.num_outputs > PIPE_MAX_SHADER_OUTPUTS) {
1557			R600_ERR("Too many stream outputs: %d\n", so.num_outputs);
1558			r = -EINVAL;
1559			goto out_err;
1560		}
1561		for (i = 0; i < so.num_outputs; i++) {
1562			if (so.output[i].output_buffer >= 4) {
1563				R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
1564					 so.output[i].output_buffer);
1565				r = -EINVAL;
1566				goto out_err;
1567			}
1568		}
1569
1570		/* Initialize locations where the outputs are stored. */
1571		for (i = 0; i < so.num_outputs; i++) {
1572			so_gpr[i] = shader->output[so.output[i].register_index].gpr;
1573
1574			/* Lower outputs with dst_offset < start_component.
1575			 *
1576			 * We can only output 4D vectors with a write mask, e.g. we can
1577			 * only output the W component at offset 3, etc. If we want
1578			 * to store Y, Z, or W at buffer offset 0, we need to use MOV
1579			 * to move it to X and output X. */
1580			if (so.output[i].dst_offset < so.output[i].start_component) {
1581				unsigned tmp = r600_get_temp(&ctx);
1582
1583				for (j = 0; j < so.output[i].num_components; j++) {
1584					struct r600_bytecode_alu alu;
1585					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1586					alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1587					alu.src[0].sel = so_gpr[i];
1588					alu.src[0].chan = so.output[i].start_component + j;
1589
1590					alu.dst.sel = tmp;
1591					alu.dst.chan = j;
1592					alu.dst.write = 1;
1593					if (j == so.output[i].num_components - 1)
1594						alu.last = 1;
1595					r = r600_bytecode_add_alu(ctx.bc, &alu);
1596					if (r)
1597						return r;
1598				}
1599				so.output[i].start_component = 0;
1600				so_gpr[i] = tmp;
1601			}
1602		}
1603
1604		/* Write outputs to buffers. */
1605		for (i = 0; i < so.num_outputs; i++) {
1606			struct r600_bytecode_output output;
1607
1608			memset(&output, 0, sizeof(struct r600_bytecode_output));
1609			output.gpr = so_gpr[i];
1610			output.elem_size = so.output[i].num_components;
1611			output.array_base = so.output[i].dst_offset - so.output[i].start_component;
1612			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1613			output.burst_count = 1;
1614			output.barrier = 1;
1615			/* array_size is an upper limit for the burst_count
1616			 * with MEM_STREAM instructions */
1617			output.array_size = 0xFFF;
1618			output.comp_mask = ((1 << so.output[i].num_components) - 1) << so.output[i].start_component;
1619			if (ctx.bc->chip_class >= EVERGREEN) {
1620				switch (so.output[i].output_buffer) {
1621				case 0:
1622					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0;
1623					break;
1624				case 1:
1625					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1;
1626					break;
1627				case 2:
1628					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2;
1629					break;
1630				case 3:
1631					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3;
1632					break;
1633				}
1634			} else {
1635				switch (so.output[i].output_buffer) {
1636				case 0:
1637					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0;
1638					break;
1639				case 1:
1640					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1;
1641					break;
1642				case 2:
1643					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2;
1644					break;
1645				case 3:
1646					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3;
1647					break;
1648				}
1649			}
1650			r = r600_bytecode_add_output(ctx.bc, &output);
1651			if (r)
1652				goto out_err;
1653		}
1654	}
1655
1656	/* export output */
1657	for (i = 0, j = 0; i < noutput; i++, j++) {
1658		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1659		output[j].gpr = shader->output[i].gpr;
1660		output[j].elem_size = 3;
1661		output[j].swizzle_x = 0;
1662		output[j].swizzle_y = 1;
1663		output[j].swizzle_z = 2;
1664		output[j].swizzle_w = 3;
1665		output[j].burst_count = 1;
1666		output[j].barrier = 1;
1667		output[j].type = -1;
1668		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1669		switch (ctx.type) {
1670		case TGSI_PROCESSOR_VERTEX:
1671			switch (shader->output[i].name) {
1672			case TGSI_SEMANTIC_POSITION:
1673				output[j].array_base = next_pos_base++;
1674				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1675				break;
1676
1677			case TGSI_SEMANTIC_PSIZE:
1678				output[j].array_base = next_pos_base++;
1679				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1680				break;
1681			case TGSI_SEMANTIC_CLIPVERTEX:
1682				j--;
1683				break;
1684			case TGSI_SEMANTIC_CLIPDIST:
1685				output[j].array_base = next_pos_base++;
1686				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1687				/* spi_sid is 0 for clipdistance outputs that were generated
1688				 * for clipvertex - we don't need to pass them to PS */
1689				if (shader->output[i].spi_sid) {
1690					j++;
1691					/* duplicate it as PARAM to pass to the pixel shader */
1692					memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
1693					output[j].array_base = next_param_base++;
1694					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1695				}
1696				break;
1697			case TGSI_SEMANTIC_FOG:
1698				output[j].swizzle_y = 4; /* 0 */
1699				output[j].swizzle_z = 4; /* 0 */
1700				output[j].swizzle_w = 5; /* 1 */
1701				break;
1702			}
1703			break;
1704		case TGSI_PROCESSOR_FRAGMENT:
1705			if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
1706				/* never export more colors than the number of CBs */
1707				if (next_pixel_base && next_pixel_base >= key.nr_cbufs) {
1708					/* skip export */
1709					j--;
1710					continue;
1711				}
1712				output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
1713				output[j].array_base = next_pixel_base++;
1714				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1715				shader->nr_ps_color_exports++;
1716				if (shader->fs_write_all && (rscreen->chip_class >= EVERGREEN)) {
1717					for (k = 1; k < key.nr_cbufs; k++) {
1718						j++;
1719						memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1720						output[j].gpr = shader->output[i].gpr;
1721						output[j].elem_size = 3;
1722						output[j].swizzle_x = 0;
1723						output[j].swizzle_y = 1;
1724						output[j].swizzle_z = 2;
1725						output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
1726						output[j].burst_count = 1;
1727						output[j].barrier = 1;
1728						output[j].array_base = next_pixel_base++;
1729						output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1730						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1731						shader->nr_ps_color_exports++;
1732					}
1733				}
1734			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
1735				output[j].array_base = 61;
1736				output[j].swizzle_x = 2;
1737				output[j].swizzle_y = 7;
1738				output[j].swizzle_z = output[j].swizzle_w = 7;
1739				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1740			} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
1741				output[j].array_base = 61;
1742				output[j].swizzle_x = 7;
1743				output[j].swizzle_y = 1;
1744				output[j].swizzle_z = output[j].swizzle_w = 7;
1745				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1746			} else {
1747				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
1748				r = -EINVAL;
1749				goto out_err;
1750			}
1751			break;
1752		default:
1753			R600_ERR("unsupported processor type %d\n", ctx.type);
1754			r = -EINVAL;
1755			goto out_err;
1756		}
1757
1758		if (output[j].type==-1) {
1759			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1760			output[j].array_base = next_param_base++;
1761		}
1762	}
1763
1764        /* add fake position export */
1765	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_pos_base == 60) {
1766			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1767			output[j].gpr = 0;
1768			output[j].elem_size = 3;
1769			output[j].swizzle_x = 7;
1770			output[j].swizzle_y = 7;
1771			output[j].swizzle_z = 7;
1772			output[j].swizzle_w = 7;
1773			output[j].burst_count = 1;
1774			output[j].barrier = 1;
1775			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1776			output[j].array_base = next_pos_base;
1777			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1778			j++;
1779	}
1780
1781	/* add fake param output for vertex shader if no param is exported */
1782	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
1783			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1784			output[j].gpr = 0;
1785			output[j].elem_size = 3;
1786			output[j].swizzle_x = 7;
1787			output[j].swizzle_y = 7;
1788			output[j].swizzle_z = 7;
1789			output[j].swizzle_w = 7;
1790			output[j].burst_count = 1;
1791			output[j].barrier = 1;
1792			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1793			output[j].array_base = 0;
1794			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1795			j++;
1796	}
1797
1798	/* add fake pixel export */
1799	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && next_pixel_base == 0) {
1800		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1801		output[j].gpr = 0;
1802		output[j].elem_size = 3;
1803		output[j].swizzle_x = 7;
1804		output[j].swizzle_y = 7;
1805		output[j].swizzle_z = 7;
1806		output[j].swizzle_w = 7;
1807		output[j].burst_count = 1;
1808		output[j].barrier = 1;
1809		output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1810		output[j].array_base = 0;
1811		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1812		j++;
1813	}
1814
1815	noutput = j;
1816
1817	/* set export done on last export of each type */
1818	for (i = noutput - 1, output_done = 0; i >= 0; i--) {
1819		if (ctx.bc->chip_class < CAYMAN) {
1820			if (i == (noutput - 1)) {
1821				output[i].end_of_program = 1;
1822			}
1823		}
1824		if (!(output_done & (1 << output[i].type))) {
1825			output_done |= (1 << output[i].type);
1826			output[i].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE);
1827		}
1828	}
1829	/* add output to bytecode */
1830	if (!use_llvm || ctx.type != TGSI_PROCESSOR_FRAGMENT) {
1831		for (i = 0; i < noutput; i++) {
1832			r = r600_bytecode_add_output(ctx.bc, &output[i]);
1833			if (r)
1834				goto out_err;
1835		}
1836	}
1837	/* add program end */
1838	if (ctx.bc->chip_class == CAYMAN)
1839		cm_bytecode_add_cf_end(ctx.bc);
1840
1841	/* check GPR limit - we have 124 = 128 - 4
1842	 * (4 are reserved as alu clause temporary registers) */
1843	if (ctx.bc->ngpr > 124) {
1844		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
1845		r = -ENOMEM;
1846		goto out_err;
1847	}
1848
1849	free(ctx.literals);
1850	tgsi_parse_free(&ctx.parse);
1851	return 0;
1852out_err:
1853	free(ctx.literals);
1854	tgsi_parse_free(&ctx.parse);
1855	return r;
1856}
1857
1858static int tgsi_unsupported(struct r600_shader_ctx *ctx)
1859{
1860	R600_ERR("%s tgsi opcode unsupported\n",
1861		 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
1862	return -EINVAL;
1863}
1864
/* No-op instruction handler: emits nothing and always reports success. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	(void)ctx;	/* intentionally unused */
	return 0;
}
1869
1870static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
1871			const struct r600_shader_src *shader_src,
1872			unsigned chan)
1873{
1874	bc_src->sel = shader_src->sel;
1875	bc_src->chan = shader_src->swizzle[chan];
1876	bc_src->neg = shader_src->neg;
1877	bc_src->abs = shader_src->abs;
1878	bc_src->rel = shader_src->rel;
1879	bc_src->value = shader_src->value[bc_src->chan];
1880	bc_src->kc_bank = shader_src->kc_bank;
1881}
1882
1883static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
1884{
1885	bc_src->abs = 1;
1886	bc_src->neg = 0;
1887}
1888
1889static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
1890{
1891	bc_src->neg = !bc_src->neg;
1892}
1893
1894static void tgsi_dst(struct r600_shader_ctx *ctx,
1895		     const struct tgsi_full_dst_register *tgsi_dst,
1896		     unsigned swizzle,
1897		     struct r600_bytecode_alu_dst *r600_dst)
1898{
1899	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1900
1901	r600_dst->sel = tgsi_dst->Register.Index;
1902	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
1903	r600_dst->chan = swizzle;
1904	r600_dst->write = 1;
1905	if (tgsi_dst->Register.Indirect)
1906		r600_dst->rel = V_SQ_REL_RELATIVE;
1907	if (inst->Instruction.Saturate) {
1908		r600_dst->clamp = 1;
1909	}
1910}
1911
/*
 * Return the index (0..3) of the highest channel enabled in the low four
 * bits of `writemask`.  Returns 0 when none of those bits is set.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* walk from channel 3 down; the first hit is the highest enabled one */
	for (chan = 3; chan > 0; chan--) {
		if ((writemask >> chan) & 1)
			return chan;
	}
	return 0;
}
1923
1924static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
1925{
1926	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1927	struct r600_bytecode_alu alu;
1928	int i, j, r;
1929	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1930
1931	for (i = 0; i < lasti + 1; i++) {
1932		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1933			continue;
1934
1935		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1936		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1937
1938		alu.inst = ctx->inst_info->r600_opcode;
1939		if (!swap) {
1940			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1941				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
1942			}
1943		} else {
1944			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
1945			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1946		}
1947		/* handle some special cases */
1948		switch (ctx->inst_info->tgsi_opcode) {
1949		case TGSI_OPCODE_SUB:
1950			r600_bytecode_src_toggle_neg(&alu.src[1]);
1951			break;
1952		case TGSI_OPCODE_ABS:
1953			r600_bytecode_src_set_abs(&alu.src[0]);
1954			break;
1955		default:
1956			break;
1957		}
1958		if (i == lasti || trans_only) {
1959			alu.last = 1;
1960		}
1961		r = r600_bytecode_add_alu(ctx->bc, &alu);
1962		if (r)
1963			return r;
1964	}
1965	return 0;
1966}
1967
/* tgsi_op2_s() with sources in TGSI order and no forced `last` flag. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1972
/* tgsi_op2_s() with the two sources exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	const int swap = 1;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1977
/* tgsi_op2_s() with every emitted instruction flagged as `last`. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 1;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1982
1983static int tgsi_ineg(struct r600_shader_ctx *ctx)
1984{
1985	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1986	struct r600_bytecode_alu alu;
1987	int i, r;
1988	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1989
1990	for (i = 0; i < lasti + 1; i++) {
1991
1992		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1993			continue;
1994		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1995		alu.inst = ctx->inst_info->r600_opcode;
1996
1997		alu.src[0].sel = V_SQ_ALU_SRC_0;
1998
1999		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2000
2001		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2002
2003		if (i == lasti) {
2004			alu.last = 1;
2005		}
2006		r = r600_bytecode_add_alu(ctx->bc, &alu);
2007		if (r)
2008			return r;
2009	}
2010	return 0;
2011
2012}
2013
2014static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
2015{
2016	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2017	int i, j, r;
2018	struct r600_bytecode_alu alu;
2019	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2020
2021	for (i = 0 ; i < last_slot; i++) {
2022		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2023		alu.inst = ctx->inst_info->r600_opcode;
2024		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2025			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
2026
2027			/* RSQ should take the absolute value of src */
2028			if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_RSQ) {
2029				r600_bytecode_src_set_abs(&alu.src[j]);
2030			}
2031		}
2032		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2033		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2034
2035		if (i == last_slot - 1)
2036			alu.last = 1;
2037		r = r600_bytecode_add_alu(ctx->bc, &alu);
2038		if (r)
2039			return r;
2040	}
2041	return 0;
2042}
2043
2044static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
2045{
2046	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2047	int i, j, k, r;
2048	struct r600_bytecode_alu alu;
2049	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2050	for (k = 0; k < last_slot; k++) {
2051		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
2052			continue;
2053
2054		for (i = 0 ; i < 4; i++) {
2055			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2056			alu.inst = ctx->inst_info->r600_opcode;
2057			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2058				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
2059			}
2060			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2061			alu.dst.write = (i == k);
2062			if (i == 3)
2063				alu.last = 1;
2064			r = r600_bytecode_add_alu(ctx->bc, &alu);
2065			if (r)
2066				return r;
2067		}
2068	}
2069	return 0;
2070}
2071
2072/*
2073 * r600 - trunc to -PI..PI range
2074 * r700 - normalize by dividing by 2PI
2075 * see fdo bug 27901
2076 */
2077static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
2078{
2079	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
2080	static float double_pi = 3.1415926535 * 2;
2081	static float neg_pi = -3.1415926535;
2082
2083	int r;
2084	struct r600_bytecode_alu alu;
2085
2086	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2087	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
2088	alu.is_op3 = 1;
2089
2090	alu.dst.chan = 0;
2091	alu.dst.sel = ctx->temp_reg;
2092	alu.dst.write = 1;
2093
2094	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2095
2096	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2097	alu.src[1].chan = 0;
2098	alu.src[1].value = *(uint32_t *)&half_inv_pi;
2099	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2100	alu.src[2].chan = 0;
2101	alu.last = 1;
2102	r = r600_bytecode_add_alu(ctx->bc, &alu);
2103	if (r)
2104		return r;
2105
2106	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2107	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
2108
2109	alu.dst.chan = 0;
2110	alu.dst.sel = ctx->temp_reg;
2111	alu.dst.write = 1;
2112
2113	alu.src[0].sel = ctx->temp_reg;
2114	alu.src[0].chan = 0;
2115	alu.last = 1;
2116	r = r600_bytecode_add_alu(ctx->bc, &alu);
2117	if (r)
2118		return r;
2119
2120	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2121	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
2122	alu.is_op3 = 1;
2123
2124	alu.dst.chan = 0;
2125	alu.dst.sel = ctx->temp_reg;
2126	alu.dst.write = 1;
2127
2128	alu.src[0].sel = ctx->temp_reg;
2129	alu.src[0].chan = 0;
2130
2131	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2132	alu.src[1].chan = 0;
2133	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
2134	alu.src[2].chan = 0;
2135
2136	if (ctx->bc->chip_class == R600) {
2137		alu.src[1].value = *(uint32_t *)&double_pi;
2138		alu.src[2].value = *(uint32_t *)&neg_pi;
2139	} else {
2140		alu.src[1].sel = V_SQ_ALU_SRC_1;
2141		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2142		alu.src[2].neg = 1;
2143	}
2144
2145	alu.last = 1;
2146	r = r600_bytecode_add_alu(ctx->bc, &alu);
2147	if (r)
2148		return r;
2149	return 0;
2150}
2151
2152static int cayman_trig(struct r600_shader_ctx *ctx)
2153{
2154	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2155	struct r600_bytecode_alu alu;
2156	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2157	int i, r;
2158
2159	r = tgsi_setup_trig(ctx);
2160	if (r)
2161		return r;
2162
2163
2164	for (i = 0; i < last_slot; i++) {
2165		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2166		alu.inst = ctx->inst_info->r600_opcode;
2167		alu.dst.chan = i;
2168
2169		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2170		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2171
2172		alu.src[0].sel = ctx->temp_reg;
2173		alu.src[0].chan = 0;
2174		if (i == last_slot - 1)
2175			alu.last = 1;
2176		r = r600_bytecode_add_alu(ctx->bc, &alu);
2177		if (r)
2178			return r;
2179	}
2180	return 0;
2181}
2182
2183static int tgsi_trig(struct r600_shader_ctx *ctx)
2184{
2185	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2186	struct r600_bytecode_alu alu;
2187	int i, r;
2188	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2189
2190	r = tgsi_setup_trig(ctx);
2191	if (r)
2192		return r;
2193
2194	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2195	alu.inst = ctx->inst_info->r600_opcode;
2196	alu.dst.chan = 0;
2197	alu.dst.sel = ctx->temp_reg;
2198	alu.dst.write = 1;
2199
2200	alu.src[0].sel = ctx->temp_reg;
2201	alu.src[0].chan = 0;
2202	alu.last = 1;
2203	r = r600_bytecode_add_alu(ctx->bc, &alu);
2204	if (r)
2205		return r;
2206
2207	/* replicate result */
2208	for (i = 0; i < lasti + 1; i++) {
2209		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2210			continue;
2211
2212		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2213		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2214
2215		alu.src[0].sel = ctx->temp_reg;
2216		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2217		if (i == lasti)
2218			alu.last = 1;
2219		r = r600_bytecode_add_alu(ctx->bc, &alu);
2220		if (r)
2221			return r;
2222	}
2223	return 0;
2224}
2225
2226static int tgsi_scs(struct r600_shader_ctx *ctx)
2227{
2228	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2229	struct r600_bytecode_alu alu;
2230	int i, r;
2231
2232	/* We'll only need the trig stuff if we are going to write to the
2233	 * X or Y components of the destination vector.
2234	 */
2235	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
2236		r = tgsi_setup_trig(ctx);
2237		if (r)
2238			return r;
2239	}
2240
2241	/* dst.x = COS */
2242	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2243		if (ctx->bc->chip_class == CAYMAN) {
2244			for (i = 0 ; i < 3; i++) {
2245				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2246				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2247				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2248
2249				if (i == 0)
2250					alu.dst.write = 1;
2251				else
2252					alu.dst.write = 0;
2253				alu.src[0].sel = ctx->temp_reg;
2254				alu.src[0].chan = 0;
2255				if (i == 2)
2256					alu.last = 1;
2257				r = r600_bytecode_add_alu(ctx->bc, &alu);
2258				if (r)
2259					return r;
2260			}
2261		} else {
2262			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2263			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2264			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2265
2266			alu.src[0].sel = ctx->temp_reg;
2267			alu.src[0].chan = 0;
2268			alu.last = 1;
2269			r = r600_bytecode_add_alu(ctx->bc, &alu);
2270			if (r)
2271				return r;
2272		}
2273	}
2274
2275	/* dst.y = SIN */
2276	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2277		if (ctx->bc->chip_class == CAYMAN) {
2278			for (i = 0 ; i < 3; i++) {
2279				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2280				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2281				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2282				if (i == 1)
2283					alu.dst.write = 1;
2284				else
2285					alu.dst.write = 0;
2286				alu.src[0].sel = ctx->temp_reg;
2287				alu.src[0].chan = 0;
2288				if (i == 2)
2289					alu.last = 1;
2290				r = r600_bytecode_add_alu(ctx->bc, &alu);
2291				if (r)
2292					return r;
2293			}
2294		} else {
2295			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2296			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2297			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2298
2299			alu.src[0].sel = ctx->temp_reg;
2300			alu.src[0].chan = 0;
2301			alu.last = 1;
2302			r = r600_bytecode_add_alu(ctx->bc, &alu);
2303			if (r)
2304				return r;
2305		}
2306	}
2307
2308	/* dst.z = 0.0; */
2309	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2310		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2311
2312		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2313
2314		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2315
2316		alu.src[0].sel = V_SQ_ALU_SRC_0;
2317		alu.src[0].chan = 0;
2318
2319		alu.last = 1;
2320
2321		r = r600_bytecode_add_alu(ctx->bc, &alu);
2322		if (r)
2323			return r;
2324	}
2325
2326	/* dst.w = 1.0; */
2327	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2328		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2329
2330		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2331
2332		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2333
2334		alu.src[0].sel = V_SQ_ALU_SRC_1;
2335		alu.src[0].chan = 0;
2336
2337		alu.last = 1;
2338
2339		r = r600_bytecode_add_alu(ctx->bc, &alu);
2340		if (r)
2341			return r;
2342	}
2343
2344	return 0;
2345}
2346
2347static int tgsi_kill(struct r600_shader_ctx *ctx)
2348{
2349	struct r600_bytecode_alu alu;
2350	int i, r;
2351
2352	for (i = 0; i < 4; i++) {
2353		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2354		alu.inst = ctx->inst_info->r600_opcode;
2355
2356		alu.dst.chan = i;
2357
2358		alu.src[0].sel = V_SQ_ALU_SRC_0;
2359
2360		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) {
2361			alu.src[1].sel = V_SQ_ALU_SRC_1;
2362			alu.src[1].neg = 1;
2363		} else {
2364			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2365		}
2366		if (i == 3) {
2367			alu.last = 1;
2368		}
2369		r = r600_bytecode_add_alu(ctx->bc, &alu);
2370		if (r)
2371			return r;
2372	}
2373
2374	/* kill must be last in ALU */
2375	ctx->bc->force_add_cf = 1;
2376	ctx->shader->uses_kill = TRUE;
2377	return 0;
2378}
2379
2380static int tgsi_lit(struct r600_shader_ctx *ctx)
2381{
2382	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2383	struct r600_bytecode_alu alu;
2384	int r;
2385
2386	/* tmp.x = max(src.y, 0.0) */
2387	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2388	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2389	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
2390	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2391	alu.src[1].chan = 1;
2392
2393	alu.dst.sel = ctx->temp_reg;
2394	alu.dst.chan = 0;
2395	alu.dst.write = 1;
2396
2397	alu.last = 1;
2398	r = r600_bytecode_add_alu(ctx->bc, &alu);
2399	if (r)
2400		return r;
2401
2402	if (inst->Dst[0].Register.WriteMask & (1 << 2))
2403	{
2404		int chan;
2405		int sel;
2406		int i;
2407
2408		if (ctx->bc->chip_class == CAYMAN) {
2409			for (i = 0; i < 3; i++) {
2410				/* tmp.z = log(tmp.x) */
2411				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2412				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2413				alu.src[0].sel = ctx->temp_reg;
2414				alu.src[0].chan = 0;
2415				alu.dst.sel = ctx->temp_reg;
2416				alu.dst.chan = i;
2417				if (i == 2) {
2418					alu.dst.write = 1;
2419					alu.last = 1;
2420				} else
2421					alu.dst.write = 0;
2422
2423				r = r600_bytecode_add_alu(ctx->bc, &alu);
2424				if (r)
2425					return r;
2426			}
2427		} else {
2428			/* tmp.z = log(tmp.x) */
2429			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2430			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2431			alu.src[0].sel = ctx->temp_reg;
2432			alu.src[0].chan = 0;
2433			alu.dst.sel = ctx->temp_reg;
2434			alu.dst.chan = 2;
2435			alu.dst.write = 1;
2436			alu.last = 1;
2437			r = r600_bytecode_add_alu(ctx->bc, &alu);
2438			if (r)
2439				return r;
2440		}
2441
2442		chan = alu.dst.chan;
2443		sel = alu.dst.sel;
2444
2445		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
2446		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2447		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT);
2448		alu.src[0].sel  = sel;
2449		alu.src[0].chan = chan;
2450		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
2451		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
2452		alu.dst.sel = ctx->temp_reg;
2453		alu.dst.chan = 0;
2454		alu.dst.write = 1;
2455		alu.is_op3 = 1;
2456		alu.last = 1;
2457		r = r600_bytecode_add_alu(ctx->bc, &alu);
2458		if (r)
2459			return r;
2460
2461		if (ctx->bc->chip_class == CAYMAN) {
2462			for (i = 0; i < 3; i++) {
2463				/* dst.z = exp(tmp.x) */
2464				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2465				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2466				alu.src[0].sel = ctx->temp_reg;
2467				alu.src[0].chan = 0;
2468				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2469				if (i == 2) {
2470					alu.dst.write = 1;
2471					alu.last = 1;
2472				} else
2473					alu.dst.write = 0;
2474				r = r600_bytecode_add_alu(ctx->bc, &alu);
2475				if (r)
2476					return r;
2477			}
2478		} else {
2479			/* dst.z = exp(tmp.x) */
2480			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2481			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2482			alu.src[0].sel = ctx->temp_reg;
2483			alu.src[0].chan = 0;
2484			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2485			alu.last = 1;
2486			r = r600_bytecode_add_alu(ctx->bc, &alu);
2487			if (r)
2488				return r;
2489		}
2490	}
2491
2492	/* dst.x, <- 1.0  */
2493	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2494	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2495	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
2496	alu.src[0].chan = 0;
2497	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2498	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
2499	r = r600_bytecode_add_alu(ctx->bc, &alu);
2500	if (r)
2501		return r;
2502
2503	/* dst.y = max(src.x, 0.0) */
2504	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2505	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2506	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2507	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2508	alu.src[1].chan = 0;
2509	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2510	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
2511	r = r600_bytecode_add_alu(ctx->bc, &alu);
2512	if (r)
2513		return r;
2514
2515	/* dst.w, <- 1.0  */
2516	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2517	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2518	alu.src[0].sel  = V_SQ_ALU_SRC_1;
2519	alu.src[0].chan = 0;
2520	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2521	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
2522	alu.last = 1;
2523	r = r600_bytecode_add_alu(ctx->bc, &alu);
2524	if (r)
2525		return r;
2526
2527	return 0;
2528}
2529
2530static int tgsi_rsq(struct r600_shader_ctx *ctx)
2531{
2532	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2533	struct r600_bytecode_alu alu;
2534	int i, r;
2535
2536	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2537
2538	/* XXX:
2539	 * For state trackers other than OpenGL, we'll want to use
2540	 * _RECIPSQRT_IEEE instead.
2541	 */
2542	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED);
2543
2544	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2545		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2546		r600_bytecode_src_set_abs(&alu.src[i]);
2547	}
2548	alu.dst.sel = ctx->temp_reg;
2549	alu.dst.write = 1;
2550	alu.last = 1;
2551	r = r600_bytecode_add_alu(ctx->bc, &alu);
2552	if (r)
2553		return r;
2554	/* replicate result */
2555	return tgsi_helper_tempx_replicate(ctx);
2556}
2557
2558static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
2559{
2560	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2561	struct r600_bytecode_alu alu;
2562	int i, r;
2563
2564	for (i = 0; i < 4; i++) {
2565		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2566		alu.src[0].sel = ctx->temp_reg;
2567		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2568		alu.dst.chan = i;
2569		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2570		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2571		if (i == 3)
2572			alu.last = 1;
2573		r = r600_bytecode_add_alu(ctx->bc, &alu);
2574		if (r)
2575			return r;
2576	}
2577	return 0;
2578}
2579
2580static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
2581{
2582	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2583	struct r600_bytecode_alu alu;
2584	int i, r;
2585
2586	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2587	alu.inst = ctx->inst_info->r600_opcode;
2588	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2589		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2590	}
2591	alu.dst.sel = ctx->temp_reg;
2592	alu.dst.write = 1;
2593	alu.last = 1;
2594	r = r600_bytecode_add_alu(ctx->bc, &alu);
2595	if (r)
2596		return r;
2597	/* replicate result */
2598	return tgsi_helper_tempx_replicate(ctx);
2599}
2600
2601static int cayman_pow(struct r600_shader_ctx *ctx)
2602{
2603	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2604	int i, r;
2605	struct r600_bytecode_alu alu;
2606	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2607
2608	for (i = 0; i < 3; i++) {
2609		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2610		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2611		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2612		alu.dst.sel = ctx->temp_reg;
2613		alu.dst.chan = i;
2614		alu.dst.write = 1;
2615		if (i == 2)
2616			alu.last = 1;
2617		r = r600_bytecode_add_alu(ctx->bc, &alu);
2618		if (r)
2619			return r;
2620	}
2621
2622	/* b * LOG2(a) */
2623	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2624	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2625	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2626	alu.src[1].sel = ctx->temp_reg;
2627	alu.dst.sel = ctx->temp_reg;
2628	alu.dst.write = 1;
2629	alu.last = 1;
2630	r = r600_bytecode_add_alu(ctx->bc, &alu);
2631	if (r)
2632		return r;
2633
2634	for (i = 0; i < last_slot; i++) {
2635		/* POW(a,b) = EXP2(b * LOG2(a))*/
2636		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2637		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2638		alu.src[0].sel = ctx->temp_reg;
2639
2640		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2641		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2642		if (i == last_slot - 1)
2643			alu.last = 1;
2644		r = r600_bytecode_add_alu(ctx->bc, &alu);
2645		if (r)
2646			return r;
2647	}
2648	return 0;
2649}
2650
2651static int tgsi_pow(struct r600_shader_ctx *ctx)
2652{
2653	struct r600_bytecode_alu alu;
2654	int r;
2655
2656	/* LOG2(a) */
2657	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2658	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2659	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2660	alu.dst.sel = ctx->temp_reg;
2661	alu.dst.write = 1;
2662	alu.last = 1;
2663	r = r600_bytecode_add_alu(ctx->bc, &alu);
2664	if (r)
2665		return r;
2666	/* b * LOG2(a) */
2667	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2668	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2669	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2670	alu.src[1].sel = ctx->temp_reg;
2671	alu.dst.sel = ctx->temp_reg;
2672	alu.dst.write = 1;
2673	alu.last = 1;
2674	r = r600_bytecode_add_alu(ctx->bc, &alu);
2675	if (r)
2676		return r;
2677	/* POW(a,b) = EXP2(b * LOG2(a))*/
2678	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2679	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2680	alu.src[0].sel = ctx->temp_reg;
2681	alu.dst.sel = ctx->temp_reg;
2682	alu.dst.write = 1;
2683	alu.last = 1;
2684	r = r600_bytecode_add_alu(ctx->bc, &alu);
2685	if (r)
2686		return r;
2687	return tgsi_helper_tempx_replicate(ctx);
2688}
2689
2690static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
2691{
2692	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2693	struct r600_bytecode_alu alu;
2694	int i, r, j;
2695	unsigned write_mask = inst->Dst[0].Register.WriteMask;
2696	int tmp0 = ctx->temp_reg;
2697	int tmp1 = r600_get_temp(ctx);
2698	int tmp2 = r600_get_temp(ctx);
2699	int tmp3 = r600_get_temp(ctx);
2700	/* Unsigned path:
2701	 *
2702	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
2703	 *
2704	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
2705	 * 2. tmp0.z = lo (tmp0.x * src2)
2706	 * 3. tmp0.w = -tmp0.z
2707	 * 4. tmp0.y = hi (tmp0.x * src2)
2708	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
2709	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
2710	 * 7. tmp1.x = tmp0.x - tmp0.w
2711	 * 8. tmp1.y = tmp0.x + tmp0.w
2712	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
2713	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
2714	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
2715	 *
2716	 * 12. tmp0.w = src1 - tmp0.y       = r
2717	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
2718	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
2719	 *
2720	 * if DIV
2721	 *
2722	 *   15. tmp1.z = tmp0.z + 1			= q + 1
2723	 *   16. tmp1.w = tmp0.z - 1			= q - 1
2724	 *
2725	 * else MOD
2726	 *
2727	 *   15. tmp1.z = tmp0.w - src2			= r - src2
2728	 *   16. tmp1.w = tmp0.w + src2			= r + src2
2729	 *
2730	 * endif
2731	 *
2732	 * 17. tmp1.x = tmp1.x & tmp1.y
2733	 *
2734	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
2735	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
2736	 *
2737	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
2738	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
2739	 *
2740	 * Signed path:
2741	 *
2742	 * Same as unsigned, using abs values of the operands,
2743	 * and fixing the sign of the result in the end.
2744	 */
2745
2746	for (i = 0; i < 4; i++) {
2747		if (!(write_mask & (1<<i)))
2748			continue;
2749
2750		if (signed_op) {
2751
2752			/* tmp2.x = -src0 */
2753			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2754			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2755
2756			alu.dst.sel = tmp2;
2757			alu.dst.chan = 0;
2758			alu.dst.write = 1;
2759
2760			alu.src[0].sel = V_SQ_ALU_SRC_0;
2761
2762			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2763
2764			alu.last = 1;
2765			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2766				return r;
2767
2768			/* tmp2.y = -src1 */
2769			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2770			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2771
2772			alu.dst.sel = tmp2;
2773			alu.dst.chan = 1;
2774			alu.dst.write = 1;
2775
2776			alu.src[0].sel = V_SQ_ALU_SRC_0;
2777
2778			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2779
2780			alu.last = 1;
2781			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2782				return r;
2783
2784			/* tmp2.z sign bit is set if src0 and src2 signs are different */
2785			/* it will be a sign of the quotient */
2786			if (!mod) {
2787
2788				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2789				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT);
2790
2791				alu.dst.sel = tmp2;
2792				alu.dst.chan = 2;
2793				alu.dst.write = 1;
2794
2795				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2796				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2797
2798				alu.last = 1;
2799				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2800					return r;
2801			}
2802
2803			/* tmp2.x = |src0| */
2804			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2805			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2806			alu.is_op3 = 1;
2807
2808			alu.dst.sel = tmp2;
2809			alu.dst.chan = 0;
2810			alu.dst.write = 1;
2811
2812			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2813			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2814			alu.src[2].sel = tmp2;
2815			alu.src[2].chan = 0;
2816
2817			alu.last = 1;
2818			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2819				return r;
2820
2821			/* tmp2.y = |src1| */
2822			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2823			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2824			alu.is_op3 = 1;
2825
2826			alu.dst.sel = tmp2;
2827			alu.dst.chan = 1;
2828			alu.dst.write = 1;
2829
2830			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2831			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2832			alu.src[2].sel = tmp2;
2833			alu.src[2].chan = 1;
2834
2835			alu.last = 1;
2836			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2837				return r;
2838
2839		}
2840
2841		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
2842		if (ctx->bc->chip_class == CAYMAN) {
2843			/* tmp3.x = u2f(src2) */
2844			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2845			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
2846
2847			alu.dst.sel = tmp3;
2848			alu.dst.chan = 0;
2849			alu.dst.write = 1;
2850
2851			if (signed_op) {
2852				alu.src[0].sel = tmp2;
2853				alu.src[0].chan = 1;
2854			} else {
2855				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2856			}
2857
2858			alu.last = 1;
2859			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2860				return r;
2861
2862			/* tmp0.x = recip(tmp3.x) */
2863			for (j = 0 ; j < 3; j++) {
2864				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2865				alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE;
2866
2867				alu.dst.sel = tmp0;
2868				alu.dst.chan = j;
2869				alu.dst.write = (j == 0);
2870
2871				alu.src[0].sel = tmp3;
2872				alu.src[0].chan = 0;
2873
2874				if (j == 2)
2875					alu.last = 1;
2876				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2877					return r;
2878			}
2879
2880			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2881			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2882
2883			alu.src[0].sel = tmp0;
2884			alu.src[0].chan = 0;
2885
2886			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2887			alu.src[1].value = 0x4f800000;
2888
2889			alu.dst.sel = tmp3;
2890			alu.dst.write = 1;
2891			alu.last = 1;
2892			r = r600_bytecode_add_alu(ctx->bc, &alu);
2893			if (r)
2894				return r;
2895
2896			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2897			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
2898
2899			alu.dst.sel = tmp0;
2900			alu.dst.chan = 0;
2901			alu.dst.write = 1;
2902
2903			alu.src[0].sel = tmp3;
2904			alu.src[0].chan = 0;
2905
2906			alu.last = 1;
2907			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2908				return r;
2909
2910		} else {
2911			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2912			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT);
2913
2914			alu.dst.sel = tmp0;
2915			alu.dst.chan = 0;
2916			alu.dst.write = 1;
2917
2918			if (signed_op) {
2919				alu.src[0].sel = tmp2;
2920				alu.src[0].chan = 1;
2921			} else {
2922				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2923			}
2924
2925			alu.last = 1;
2926			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2927				return r;
2928		}
2929
2930		/* 2. tmp0.z = lo (tmp0.x * src2) */
2931		if (ctx->bc->chip_class == CAYMAN) {
2932			for (j = 0 ; j < 4; j++) {
2933				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2934				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2935
2936				alu.dst.sel = tmp0;
2937				alu.dst.chan = j;
2938				alu.dst.write = (j == 2);
2939
2940				alu.src[0].sel = tmp0;
2941				alu.src[0].chan = 0;
2942				if (signed_op) {
2943					alu.src[1].sel = tmp2;
2944					alu.src[1].chan = 1;
2945				} else {
2946					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2947				}
2948
2949				alu.last = (j == 3);
2950				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2951					return r;
2952			}
2953		} else {
2954			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2955			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2956
2957			alu.dst.sel = tmp0;
2958			alu.dst.chan = 2;
2959			alu.dst.write = 1;
2960
2961			alu.src[0].sel = tmp0;
2962			alu.src[0].chan = 0;
2963			if (signed_op) {
2964				alu.src[1].sel = tmp2;
2965				alu.src[1].chan = 1;
2966			} else {
2967				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2968			}
2969
2970			alu.last = 1;
2971			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2972				return r;
2973		}
2974
2975		/* 3. tmp0.w = -tmp0.z */
2976		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2977		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2978
2979		alu.dst.sel = tmp0;
2980		alu.dst.chan = 3;
2981		alu.dst.write = 1;
2982
2983		alu.src[0].sel = V_SQ_ALU_SRC_0;
2984		alu.src[1].sel = tmp0;
2985		alu.src[1].chan = 2;
2986
2987		alu.last = 1;
2988		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2989			return r;
2990
2991		/* 4. tmp0.y = hi (tmp0.x * src2) */
2992		if (ctx->bc->chip_class == CAYMAN) {
2993			for (j = 0 ; j < 4; j++) {
2994				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2995				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2996
2997				alu.dst.sel = tmp0;
2998				alu.dst.chan = j;
2999				alu.dst.write = (j == 1);
3000
3001				alu.src[0].sel = tmp0;
3002				alu.src[0].chan = 0;
3003
3004				if (signed_op) {
3005					alu.src[1].sel = tmp2;
3006					alu.src[1].chan = 1;
3007				} else {
3008					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3009				}
3010				alu.last = (j == 3);
3011				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3012					return r;
3013			}
3014		} else {
3015			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3016			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3017
3018			alu.dst.sel = tmp0;
3019			alu.dst.chan = 1;
3020			alu.dst.write = 1;
3021
3022			alu.src[0].sel = tmp0;
3023			alu.src[0].chan = 0;
3024
3025			if (signed_op) {
3026				alu.src[1].sel = tmp2;
3027				alu.src[1].chan = 1;
3028			} else {
3029				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3030			}
3031
3032			alu.last = 1;
3033			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3034				return r;
3035		}
3036
3037		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
3038		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3039		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3040		alu.is_op3 = 1;
3041
3042		alu.dst.sel = tmp0;
3043		alu.dst.chan = 2;
3044		alu.dst.write = 1;
3045
3046		alu.src[0].sel = tmp0;
3047		alu.src[0].chan = 1;
3048		alu.src[1].sel = tmp0;
3049		alu.src[1].chan = 3;
3050		alu.src[2].sel = tmp0;
3051		alu.src[2].chan = 2;
3052
3053		alu.last = 1;
3054		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3055			return r;
3056
3057		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
3058		if (ctx->bc->chip_class == CAYMAN) {
3059			for (j = 0 ; j < 4; j++) {
3060				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3061				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3062
3063				alu.dst.sel = tmp0;
3064				alu.dst.chan = j;
3065				alu.dst.write = (j == 3);
3066
3067				alu.src[0].sel = tmp0;
3068				alu.src[0].chan = 2;
3069
3070				alu.src[1].sel = tmp0;
3071				alu.src[1].chan = 0;
3072
3073				alu.last = (j == 3);
3074				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3075					return r;
3076			}
3077		} else {
3078			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3079			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3080
3081			alu.dst.sel = tmp0;
3082			alu.dst.chan = 3;
3083			alu.dst.write = 1;
3084
3085			alu.src[0].sel = tmp0;
3086			alu.src[0].chan = 2;
3087
3088			alu.src[1].sel = tmp0;
3089			alu.src[1].chan = 0;
3090
3091			alu.last = 1;
3092			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3093				return r;
3094		}
3095
3096		/* 7. tmp1.x = tmp0.x - tmp0.w */
3097		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3098		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3099
3100		alu.dst.sel = tmp1;
3101		alu.dst.chan = 0;
3102		alu.dst.write = 1;
3103
3104		alu.src[0].sel = tmp0;
3105		alu.src[0].chan = 0;
3106		alu.src[1].sel = tmp0;
3107		alu.src[1].chan = 3;
3108
3109		alu.last = 1;
3110		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3111			return r;
3112
3113		/* 8. tmp1.y = tmp0.x + tmp0.w */
3114		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3115		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3116
3117		alu.dst.sel = tmp1;
3118		alu.dst.chan = 1;
3119		alu.dst.write = 1;
3120
3121		alu.src[0].sel = tmp0;
3122		alu.src[0].chan = 0;
3123		alu.src[1].sel = tmp0;
3124		alu.src[1].chan = 3;
3125
3126		alu.last = 1;
3127		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3128			return r;
3129
3130		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
3131		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3132		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3133		alu.is_op3 = 1;
3134
3135		alu.dst.sel = tmp0;
3136		alu.dst.chan = 0;
3137		alu.dst.write = 1;
3138
3139		alu.src[0].sel = tmp0;
3140		alu.src[0].chan = 1;
3141		alu.src[1].sel = tmp1;
3142		alu.src[1].chan = 1;
3143		alu.src[2].sel = tmp1;
3144		alu.src[2].chan = 0;
3145
3146		alu.last = 1;
3147		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3148			return r;
3149
3150		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
3151		if (ctx->bc->chip_class == CAYMAN) {
3152			for (j = 0 ; j < 4; j++) {
3153				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3154				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3155
3156				alu.dst.sel = tmp0;
3157				alu.dst.chan = j;
3158				alu.dst.write = (j == 2);
3159
3160				alu.src[0].sel = tmp0;
3161				alu.src[0].chan = 0;
3162
3163				if (signed_op) {
3164					alu.src[1].sel = tmp2;
3165					alu.src[1].chan = 0;
3166				} else {
3167					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3168				}
3169
3170				alu.last = (j == 3);
3171				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3172					return r;
3173			}
3174		} else {
3175			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3176			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3177
3178			alu.dst.sel = tmp0;
3179			alu.dst.chan = 2;
3180			alu.dst.write = 1;
3181
3182			alu.src[0].sel = tmp0;
3183			alu.src[0].chan = 0;
3184
3185			if (signed_op) {
3186				alu.src[1].sel = tmp2;
3187				alu.src[1].chan = 0;
3188			} else {
3189				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3190			}
3191
3192			alu.last = 1;
3193			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3194				return r;
3195		}
3196
3197		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
3198		if (ctx->bc->chip_class == CAYMAN) {
3199			for (j = 0 ; j < 4; j++) {
3200				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3201				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3202
3203				alu.dst.sel = tmp0;
3204				alu.dst.chan = j;
3205				alu.dst.write = (j == 1);
3206
3207				if (signed_op) {
3208					alu.src[0].sel = tmp2;
3209					alu.src[0].chan = 1;
3210				} else {
3211					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3212				}
3213
3214				alu.src[1].sel = tmp0;
3215				alu.src[1].chan = 2;
3216
3217				alu.last = (j == 3);
3218				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3219					return r;
3220			}
3221		} else {
3222			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3223			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3224
3225			alu.dst.sel = tmp0;
3226			alu.dst.chan = 1;
3227			alu.dst.write = 1;
3228
3229			if (signed_op) {
3230				alu.src[0].sel = tmp2;
3231				alu.src[0].chan = 1;
3232			} else {
3233				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3234			}
3235
3236			alu.src[1].sel = tmp0;
3237			alu.src[1].chan = 2;
3238
3239			alu.last = 1;
3240			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3241				return r;
3242		}
3243
3244		/* 12. tmp0.w = src1 - tmp0.y       = r */
3245		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3246		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3247
3248		alu.dst.sel = tmp0;
3249		alu.dst.chan = 3;
3250		alu.dst.write = 1;
3251
3252		if (signed_op) {
3253			alu.src[0].sel = tmp2;
3254			alu.src[0].chan = 0;
3255		} else {
3256			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3257		}
3258
3259		alu.src[1].sel = tmp0;
3260		alu.src[1].chan = 1;
3261
3262		alu.last = 1;
3263		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3264			return r;
3265
3266		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
3267		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3268		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3269
3270		alu.dst.sel = tmp1;
3271		alu.dst.chan = 0;
3272		alu.dst.write = 1;
3273
3274		alu.src[0].sel = tmp0;
3275		alu.src[0].chan = 3;
3276		if (signed_op) {
3277			alu.src[1].sel = tmp2;
3278			alu.src[1].chan = 1;
3279		} else {
3280			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3281		}
3282
3283		alu.last = 1;
3284		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3285			return r;
3286
3287		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
3288		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3289		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3290
3291		alu.dst.sel = tmp1;
3292		alu.dst.chan = 1;
3293		alu.dst.write = 1;
3294
3295		if (signed_op) {
3296			alu.src[0].sel = tmp2;
3297			alu.src[0].chan = 0;
3298		} else {
3299			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3300		}
3301
3302		alu.src[1].sel = tmp0;
3303		alu.src[1].chan = 1;
3304
3305		alu.last = 1;
3306		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3307			return r;
3308
3309		if (mod) { /* UMOD */
3310
3311			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
3312			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3313			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3314
3315			alu.dst.sel = tmp1;
3316			alu.dst.chan = 2;
3317			alu.dst.write = 1;
3318
3319			alu.src[0].sel = tmp0;
3320			alu.src[0].chan = 3;
3321
3322			if (signed_op) {
3323				alu.src[1].sel = tmp2;
3324				alu.src[1].chan = 1;
3325			} else {
3326				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3327			}
3328
3329			alu.last = 1;
3330			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3331				return r;
3332
3333			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
3334			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3335			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3336
3337			alu.dst.sel = tmp1;
3338			alu.dst.chan = 3;
3339			alu.dst.write = 1;
3340
3341			alu.src[0].sel = tmp0;
3342			alu.src[0].chan = 3;
3343			if (signed_op) {
3344				alu.src[1].sel = tmp2;
3345				alu.src[1].chan = 1;
3346			} else {
3347				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3348			}
3349
3350			alu.last = 1;
3351			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3352				return r;
3353
3354		} else { /* UDIV */
3355
3356			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
3357			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3358			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3359
3360			alu.dst.sel = tmp1;
3361			alu.dst.chan = 2;
3362			alu.dst.write = 1;
3363
3364			alu.src[0].sel = tmp0;
3365			alu.src[0].chan = 2;
3366			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3367
3368			alu.last = 1;
3369			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3370				return r;
3371
3372			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
3373			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3374			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3375
3376			alu.dst.sel = tmp1;
3377			alu.dst.chan = 3;
3378			alu.dst.write = 1;
3379
3380			alu.src[0].sel = tmp0;
3381			alu.src[0].chan = 2;
3382			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
3383
3384			alu.last = 1;
3385			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3386				return r;
3387
3388		}
3389
3390		/* 17. tmp1.x = tmp1.x & tmp1.y */
3391		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3392		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
3393
3394		alu.dst.sel = tmp1;
3395		alu.dst.chan = 0;
3396		alu.dst.write = 1;
3397
3398		alu.src[0].sel = tmp1;
3399		alu.src[0].chan = 0;
3400		alu.src[1].sel = tmp1;
3401		alu.src[1].chan = 1;
3402
3403		alu.last = 1;
3404		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3405			return r;
3406
3407		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
3408		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
3409		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3410		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3411		alu.is_op3 = 1;
3412
3413		alu.dst.sel = tmp0;
3414		alu.dst.chan = 2;
3415		alu.dst.write = 1;
3416
3417		alu.src[0].sel = tmp1;
3418		alu.src[0].chan = 0;
3419		alu.src[1].sel = tmp0;
3420		alu.src[1].chan = mod ? 3 : 2;
3421		alu.src[2].sel = tmp1;
3422		alu.src[2].chan = 2;
3423
3424		alu.last = 1;
3425		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3426			return r;
3427
3428		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
3429		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3430		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3431		alu.is_op3 = 1;
3432
3433		if (signed_op) {
3434			alu.dst.sel = tmp0;
3435			alu.dst.chan = 2;
3436			alu.dst.write = 1;
3437		} else {
3438			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3439		}
3440
3441		alu.src[0].sel = tmp1;
3442		alu.src[0].chan = 1;
3443		alu.src[1].sel = tmp1;
3444		alu.src[1].chan = 3;
3445		alu.src[2].sel = tmp0;
3446		alu.src[2].chan = 2;
3447
3448		alu.last = 1;
3449		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3450			return r;
3451
3452		if (signed_op) {
3453
3454			/* fix the sign of the result */
3455
3456			if (mod) {
3457
3458				/* tmp0.x = -tmp0.z */
3459				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3460				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3461
3462				alu.dst.sel = tmp0;
3463				alu.dst.chan = 0;
3464				alu.dst.write = 1;
3465
3466				alu.src[0].sel = V_SQ_ALU_SRC_0;
3467				alu.src[1].sel = tmp0;
3468				alu.src[1].chan = 2;
3469
3470				alu.last = 1;
3471				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3472					return r;
3473
3474				/* sign of the remainder is the same as the sign of src0 */
3475				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
3476				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3477				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3478				alu.is_op3 = 1;
3479
3480				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3481
3482				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3483				alu.src[1].sel = tmp0;
3484				alu.src[1].chan = 2;
3485				alu.src[2].sel = tmp0;
3486				alu.src[2].chan = 0;
3487
3488				alu.last = 1;
3489				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3490					return r;
3491
3492			} else {
3493
3494				/* tmp0.x = -tmp0.z */
3495				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3496				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3497
3498				alu.dst.sel = tmp0;
3499				alu.dst.chan = 0;
3500				alu.dst.write = 1;
3501
3502				alu.src[0].sel = V_SQ_ALU_SRC_0;
3503				alu.src[1].sel = tmp0;
3504				alu.src[1].chan = 2;
3505
3506				alu.last = 1;
3507				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3508					return r;
3509
3510				/* fix the quotient sign (same as the sign of src0*src1) */
3511				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
3512				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3513				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3514				alu.is_op3 = 1;
3515
3516				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3517
3518				alu.src[0].sel = tmp2;
3519				alu.src[0].chan = 2;
3520				alu.src[1].sel = tmp0;
3521				alu.src[1].chan = 2;
3522				alu.src[2].sel = tmp0;
3523				alu.src[2].chan = 0;
3524
3525				alu.last = 1;
3526				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3527					return r;
3528			}
3529		}
3530	}
3531	return 0;
3532}
3533
/* UDIV: unsigned integer quotient, lowered through tgsi_divmod(). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int mod = 0;		/* produce the quotient */
	const int signed_op = 0;	/* operands are unsigned */

	return tgsi_divmod(ctx, mod, signed_op);
}
3538
/* UMOD: unsigned integer remainder, lowered through tgsi_divmod(). */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int mod = 1;		/* produce the remainder */
	const int signed_op = 0;	/* operands are unsigned */

	return tgsi_divmod(ctx, mod, signed_op);
}
3543
/* IDIV: signed integer quotient, lowered through tgsi_divmod(). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int mod = 0;		/* produce the quotient */
	const int signed_op = 1;	/* operands are signed */

	return tgsi_divmod(ctx, mod, signed_op);
}
3548
/* IMOD: signed integer remainder, lowered through tgsi_divmod(). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int mod = 1;		/* produce the remainder */
	const int signed_op = 1;	/* operands are signed */

	return tgsi_divmod(ctx, mod, signed_op);
}
3553
3554
3555static int tgsi_f2i(struct r600_shader_ctx *ctx)
3556{
3557	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3558	struct r600_bytecode_alu alu;
3559	int i, r;
3560	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3561	int last_inst = tgsi_last_instruction(write_mask);
3562
3563	for (i = 0; i < 4; i++) {
3564		if (!(write_mask & (1<<i)))
3565			continue;
3566
3567		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3568		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC);
3569
3570		alu.dst.sel = ctx->temp_reg;
3571		alu.dst.chan = i;
3572		alu.dst.write = 1;
3573
3574		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3575		if (i == last_inst)
3576			alu.last = 1;
3577		r = r600_bytecode_add_alu(ctx->bc, &alu);
3578		if (r)
3579			return r;
3580	}
3581
3582	for (i = 0; i < 4; i++) {
3583		if (!(write_mask & (1<<i)))
3584			continue;
3585
3586		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3587		alu.inst = ctx->inst_info->r600_opcode;
3588
3589		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3590
3591		alu.src[0].sel = ctx->temp_reg;
3592		alu.src[0].chan = i;
3593
3594		if (i == last_inst || alu.inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT)
3595			alu.last = 1;
3596		r = r600_bytecode_add_alu(ctx->bc, &alu);
3597		if (r)
3598			return r;
3599	}
3600
3601	return 0;
3602}
3603
3604static int tgsi_iabs(struct r600_shader_ctx *ctx)
3605{
3606	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3607	struct r600_bytecode_alu alu;
3608	int i, r;
3609	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3610	int last_inst = tgsi_last_instruction(write_mask);
3611
3612	/* tmp = -src */
3613	for (i = 0; i < 4; i++) {
3614		if (!(write_mask & (1<<i)))
3615			continue;
3616
3617		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3618		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3619
3620		alu.dst.sel = ctx->temp_reg;
3621		alu.dst.chan = i;
3622		alu.dst.write = 1;
3623
3624		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3625		alu.src[0].sel = V_SQ_ALU_SRC_0;
3626
3627		if (i == last_inst)
3628			alu.last = 1;
3629		r = r600_bytecode_add_alu(ctx->bc, &alu);
3630		if (r)
3631			return r;
3632	}
3633
3634	/* dst = (src >= 0 ? src : tmp) */
3635	for (i = 0; i < 4; i++) {
3636		if (!(write_mask & (1<<i)))
3637			continue;
3638
3639		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3640		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3641		alu.is_op3 = 1;
3642		alu.dst.write = 1;
3643
3644		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3645
3646		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3647		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3648		alu.src[2].sel = ctx->temp_reg;
3649		alu.src[2].chan = i;
3650
3651		if (i == last_inst)
3652			alu.last = 1;
3653		r = r600_bytecode_add_alu(ctx->bc, &alu);
3654		if (r)
3655			return r;
3656	}
3657	return 0;
3658}
3659
3660static int tgsi_issg(struct r600_shader_ctx *ctx)
3661{
3662	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3663	struct r600_bytecode_alu alu;
3664	int i, r;
3665	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3666	int last_inst = tgsi_last_instruction(write_mask);
3667
3668	/* tmp = (src >= 0 ? src : -1) */
3669	for (i = 0; i < 4; i++) {
3670		if (!(write_mask & (1<<i)))
3671			continue;
3672
3673		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3674		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3675		alu.is_op3 = 1;
3676
3677		alu.dst.sel = ctx->temp_reg;
3678		alu.dst.chan = i;
3679		alu.dst.write = 1;
3680
3681		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3682		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3683		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
3684
3685		if (i == last_inst)
3686			alu.last = 1;
3687		r = r600_bytecode_add_alu(ctx->bc, &alu);
3688		if (r)
3689			return r;
3690	}
3691
3692	/* dst = (tmp > 0 ? 1 : tmp) */
3693	for (i = 0; i < 4; i++) {
3694		if (!(write_mask & (1<<i)))
3695			continue;
3696
3697		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3698		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT_INT);
3699		alu.is_op3 = 1;
3700		alu.dst.write = 1;
3701
3702		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3703
3704		alu.src[0].sel = ctx->temp_reg;
3705		alu.src[0].chan = i;
3706
3707		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3708
3709		alu.src[2].sel = ctx->temp_reg;
3710		alu.src[2].chan = i;
3711
3712		if (i == last_inst)
3713			alu.last = 1;
3714		r = r600_bytecode_add_alu(ctx->bc, &alu);
3715		if (r)
3716			return r;
3717	}
3718	return 0;
3719}
3720
3721
3722
3723static int tgsi_ssg(struct r600_shader_ctx *ctx)
3724{
3725	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3726	struct r600_bytecode_alu alu;
3727	int i, r;
3728
3729	/* tmp = (src > 0 ? 1 : src) */
3730	for (i = 0; i < 4; i++) {
3731		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3732		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3733		alu.is_op3 = 1;
3734
3735		alu.dst.sel = ctx->temp_reg;
3736		alu.dst.chan = i;
3737
3738		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3739		alu.src[1].sel = V_SQ_ALU_SRC_1;
3740		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
3741
3742		if (i == 3)
3743			alu.last = 1;
3744		r = r600_bytecode_add_alu(ctx->bc, &alu);
3745		if (r)
3746			return r;
3747	}
3748
3749	/* dst = (-tmp > 0 ? -1 : tmp) */
3750	for (i = 0; i < 4; i++) {
3751		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3752		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3753		alu.is_op3 = 1;
3754		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3755
3756		alu.src[0].sel = ctx->temp_reg;
3757		alu.src[0].chan = i;
3758		alu.src[0].neg = 1;
3759
3760		alu.src[1].sel = V_SQ_ALU_SRC_1;
3761		alu.src[1].neg = 1;
3762
3763		alu.src[2].sel = ctx->temp_reg;
3764		alu.src[2].chan = i;
3765
3766		if (i == 3)
3767			alu.last = 1;
3768		r = r600_bytecode_add_alu(ctx->bc, &alu);
3769		if (r)
3770			return r;
3771	}
3772	return 0;
3773}
3774
3775static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
3776{
3777	struct r600_bytecode_alu alu;
3778	int i, r;
3779
3780	for (i = 0; i < 4; i++) {
3781		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3782		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
3783			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
3784			alu.dst.chan = i;
3785		} else {
3786			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3787			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3788			alu.src[0].sel = ctx->temp_reg;
3789			alu.src[0].chan = i;
3790		}
3791		if (i == 3) {
3792			alu.last = 1;
3793		}
3794		r = r600_bytecode_add_alu(ctx->bc, &alu);
3795		if (r)
3796			return r;
3797	}
3798	return 0;
3799}
3800
3801static int tgsi_op3(struct r600_shader_ctx *ctx)
3802{
3803	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3804	struct r600_bytecode_alu alu;
3805	int i, j, r;
3806	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3807
3808	for (i = 0; i < lasti + 1; i++) {
3809		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3810			continue;
3811
3812		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3813		alu.inst = ctx->inst_info->r600_opcode;
3814		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3815			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3816		}
3817
3818		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3819		alu.dst.chan = i;
3820		alu.dst.write = 1;
3821		alu.is_op3 = 1;
3822		if (i == lasti) {
3823			alu.last = 1;
3824		}
3825		r = r600_bytecode_add_alu(ctx->bc, &alu);
3826		if (r)
3827			return r;
3828	}
3829	return 0;
3830}
3831
3832static int tgsi_dp(struct r600_shader_ctx *ctx)
3833{
3834	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3835	struct r600_bytecode_alu alu;
3836	int i, j, r;
3837
3838	for (i = 0; i < 4; i++) {
3839		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3840		alu.inst = ctx->inst_info->r600_opcode;
3841		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3842			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3843		}
3844
3845		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3846		alu.dst.chan = i;
3847		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3848		/* handle some special cases */
3849		switch (ctx->inst_info->tgsi_opcode) {
3850		case TGSI_OPCODE_DP2:
3851			if (i > 1) {
3852				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3853				alu.src[0].chan = alu.src[1].chan = 0;
3854			}
3855			break;
3856		case TGSI_OPCODE_DP3:
3857			if (i > 2) {
3858				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3859				alu.src[0].chan = alu.src[1].chan = 0;
3860			}
3861			break;
3862		case TGSI_OPCODE_DPH:
3863			if (i == 3) {
3864				alu.src[0].sel = V_SQ_ALU_SRC_1;
3865				alu.src[0].chan = 0;
3866				alu.src[0].neg = 0;
3867			}
3868			break;
3869		default:
3870			break;
3871		}
3872		if (i == 3) {
3873			alu.last = 1;
3874		}
3875		r = r600_bytecode_add_alu(ctx->bc, &alu);
3876		if (r)
3877			return r;
3878	}
3879	return 0;
3880}
3881
3882static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
3883						    unsigned index)
3884{
3885	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3886	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
3887		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
3888		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
3889		ctx->src[index].neg || ctx->src[index].abs;
3890}
3891
3892static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
3893					unsigned index)
3894{
3895	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3896	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
3897}
3898
3899static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
3900{
3901	struct r600_bytecode_vtx vtx;
3902	struct r600_bytecode_alu alu;
3903	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3904	int src_gpr, r, i;
3905	int id = tgsi_tex_get_src_gpr(ctx, 1);
3906
3907	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
3908	if (src_requires_loading) {
3909		for (i = 0; i < 4; i++) {
3910			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3911			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3912			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3913			alu.dst.sel = ctx->temp_reg;
3914			alu.dst.chan = i;
3915			if (i == 3)
3916				alu.last = 1;
3917			alu.dst.write = 1;
3918			r = r600_bytecode_add_alu(ctx->bc, &alu);
3919			if (r)
3920				return r;
3921		}
3922		src_gpr = ctx->temp_reg;
3923	}
3924
3925	memset(&vtx, 0, sizeof(vtx));
3926	vtx.inst = 0;
3927	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
3928	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
3929	vtx.src_gpr = src_gpr;
3930	vtx.mega_fetch_count = 16;
3931	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
3932	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
3933	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
3934	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
3935	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
3936	vtx.use_const_fields = 1;
3937	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
3938
3939	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
3940		return r;
3941
3942	if (ctx->bc->chip_class >= EVERGREEN)
3943		return 0;
3944
3945	for (i = 0; i < 4; i++) {
3946		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3947		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3948			continue;
3949
3950		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3951		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
3952
3953		alu.dst.chan = i;
3954		alu.dst.sel = vtx.dst_gpr;
3955		alu.dst.write = 1;
3956
3957		alu.src[0].sel = vtx.dst_gpr;
3958		alu.src[0].chan = i;
3959
3960		alu.src[1].sel = 512 + (id * 2);
3961		alu.src[1].chan = i % 4;
3962		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3963
3964		if (i == lasti)
3965			alu.last = 1;
3966		r = r600_bytecode_add_alu(ctx->bc, &alu);
3967		if (r)
3968			return r;
3969	}
3970
3971	if (inst->Dst[0].Register.WriteMask & 3) {
3972		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3973		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT);
3974
3975		alu.dst.chan = 3;
3976		alu.dst.sel = vtx.dst_gpr;
3977		alu.dst.write = 1;
3978
3979		alu.src[0].sel = vtx.dst_gpr;
3980		alu.src[0].chan = 3;
3981
3982		alu.src[1].sel = 512 + (id * 2) + 1;
3983		alu.src[1].chan = 0;
3984		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3985
3986		alu.last = 1;
3987		r = r600_bytecode_add_alu(ctx->bc, &alu);
3988		if (r)
3989			return r;
3990	}
3991	return 0;
3992}
3993
3994static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
3995{
3996	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3997	struct r600_bytecode_alu alu;
3998	int r;
3999	int id = tgsi_tex_get_src_gpr(ctx, 1);
4000
4001	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4002	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4003
4004	if (ctx->bc->chip_class >= EVERGREEN) {
4005		alu.src[0].sel = 512 + (id / 4);
4006		alu.src[0].chan = id % 4;
4007	} else {
4008		/* r600 we have them at channel 2 of the second dword */
4009		alu.src[0].sel = 512 + (id * 2) + 1;
4010		alu.src[0].chan = 1;
4011	}
4012	alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4013	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
4014	alu.last = 1;
4015	r = r600_bytecode_add_alu(ctx->bc, &alu);
4016	if (r)
4017		return r;
4018	return 0;
4019}
4020
4021static int tgsi_tex(struct r600_shader_ctx *ctx)
4022{
4023	static float one_point_five = 1.5f;
4024	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4025	struct r600_bytecode_tex tex;
4026	struct r600_bytecode_alu alu;
4027	unsigned src_gpr;
4028	int r, i, j;
4029	int opcode;
4030	bool read_compressed_msaa = ctx->bc->msaa_texture_mode == MSAA_TEXTURE_COMPRESSED &&
4031				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
4032				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
4033				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
4034	/* Texture fetch instructions can only use gprs as source.
4035	 * Also they cannot negate the source or take the absolute value */
4036	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
4037                                              tgsi_tex_src_requires_loading(ctx, 0)) ||
4038					     read_compressed_msaa;
4039	boolean src_loaded = FALSE;
4040	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
4041	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
4042	boolean has_txq_cube_array_z = false;
4043
4044	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
4045	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
4046	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
4047		if (inst->Dst[0].Register.WriteMask & 4) {
4048			ctx->shader->has_txq_cube_array_z_comp = true;
4049			has_txq_cube_array_z = true;
4050		}
4051
4052	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
4053	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
4054	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
4055		sampler_src_reg = 2;
4056
4057	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
4058
4059	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
4060		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
4061			ctx->shader->uses_tex_buffers = true;
4062			return r600_do_buffer_txq(ctx);
4063		}
4064		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
4065			if (ctx->bc->chip_class < EVERGREEN)
4066				ctx->shader->uses_tex_buffers = true;
4067			return do_vtx_fetch_inst(ctx, src_requires_loading);
4068		}
4069	}
4070
4071	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
4072		/* get offset values */
4073		if (inst->Texture.NumOffsets) {
4074			assert(inst->Texture.NumOffsets == 1);
4075
4076			offset_x = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
4077			offset_y = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
4078			offset_z = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
4079		}
4080	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
4081		/* TGSI moves the sampler to src reg 3 for TXD */
4082		sampler_src_reg = 3;
4083
4084		for (i = 1; i < 3; i++) {
4085			/* set gradients h/v */
4086			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4087			tex.inst = (i == 1) ? SQ_TEX_INST_SET_GRADIENTS_H :
4088				SQ_TEX_INST_SET_GRADIENTS_V;
4089			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4090			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4091
4092			if (tgsi_tex_src_requires_loading(ctx, i)) {
4093				tex.src_gpr = r600_get_temp(ctx);
4094				tex.src_sel_x = 0;
4095				tex.src_sel_y = 1;
4096				tex.src_sel_z = 2;
4097				tex.src_sel_w = 3;
4098
4099				for (j = 0; j < 4; j++) {
4100					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4101					alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4102                                        r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
4103                                        alu.dst.sel = tex.src_gpr;
4104                                        alu.dst.chan = j;
4105                                        if (j == 3)
4106                                                alu.last = 1;
4107                                        alu.dst.write = 1;
4108                                        r = r600_bytecode_add_alu(ctx->bc, &alu);
4109                                        if (r)
4110                                                return r;
4111				}
4112
4113			} else {
4114				tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i);
4115				tex.src_sel_x = ctx->src[i].swizzle[0];
4116				tex.src_sel_y = ctx->src[i].swizzle[1];
4117				tex.src_sel_z = ctx->src[i].swizzle[2];
4118				tex.src_sel_w = ctx->src[i].swizzle[3];
4119				tex.src_rel = ctx->src[i].rel;
4120			}
4121			tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */
4122			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
4123			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
4124				tex.coord_type_x = 1;
4125				tex.coord_type_y = 1;
4126				tex.coord_type_z = 1;
4127				tex.coord_type_w = 1;
4128			}
4129			r = r600_bytecode_add_tex(ctx->bc, &tex);
4130			if (r)
4131				return r;
4132		}
4133	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
4134		int out_chan;
4135		/* Add perspective divide */
4136		if (ctx->bc->chip_class == CAYMAN) {
4137			out_chan = 2;
4138			for (i = 0; i < 3; i++) {
4139				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4140				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4141				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4142
4143				alu.dst.sel = ctx->temp_reg;
4144				alu.dst.chan = i;
4145				if (i == 2)
4146					alu.last = 1;
4147				if (out_chan == i)
4148					alu.dst.write = 1;
4149				r = r600_bytecode_add_alu(ctx->bc, &alu);
4150				if (r)
4151					return r;
4152			}
4153
4154		} else {
4155			out_chan = 3;
4156			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4157			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4158			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4159
4160			alu.dst.sel = ctx->temp_reg;
4161			alu.dst.chan = out_chan;
4162			alu.last = 1;
4163			alu.dst.write = 1;
4164			r = r600_bytecode_add_alu(ctx->bc, &alu);
4165			if (r)
4166				return r;
4167		}
4168
4169		for (i = 0; i < 3; i++) {
4170			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4171			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4172			alu.src[0].sel = ctx->temp_reg;
4173			alu.src[0].chan = out_chan;
4174			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4175			alu.dst.sel = ctx->temp_reg;
4176			alu.dst.chan = i;
4177			alu.dst.write = 1;
4178			r = r600_bytecode_add_alu(ctx->bc, &alu);
4179			if (r)
4180				return r;
4181		}
4182		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4183		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4184		alu.src[0].sel = V_SQ_ALU_SRC_1;
4185		alu.src[0].chan = 0;
4186		alu.dst.sel = ctx->temp_reg;
4187		alu.dst.chan = 3;
4188		alu.last = 1;
4189		alu.dst.write = 1;
4190		r = r600_bytecode_add_alu(ctx->bc, &alu);
4191		if (r)
4192			return r;
4193		src_loaded = TRUE;
4194		src_gpr = ctx->temp_reg;
4195	}
4196
4197	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
4198	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
4199	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4200	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
4201	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
4202	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
4203
4204		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
4205		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
4206
4207		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
4208		for (i = 0; i < 4; i++) {
4209			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4210			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE);
4211			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
4212			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
4213			alu.dst.sel = ctx->temp_reg;
4214			alu.dst.chan = i;
4215			if (i == 3)
4216				alu.last = 1;
4217			alu.dst.write = 1;
4218			r = r600_bytecode_add_alu(ctx->bc, &alu);
4219			if (r)
4220				return r;
4221		}
4222
4223		/* tmp1.z = RCP_e(|tmp1.z|) */
4224		if (ctx->bc->chip_class == CAYMAN) {
4225			for (i = 0; i < 3; i++) {
4226				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4227				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4228				alu.src[0].sel = ctx->temp_reg;
4229				alu.src[0].chan = 2;
4230				alu.src[0].abs = 1;
4231				alu.dst.sel = ctx->temp_reg;
4232				alu.dst.chan = i;
4233				if (i == 2)
4234					alu.dst.write = 1;
4235				if (i == 2)
4236					alu.last = 1;
4237				r = r600_bytecode_add_alu(ctx->bc, &alu);
4238				if (r)
4239					return r;
4240			}
4241		} else {
4242			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4243			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4244			alu.src[0].sel = ctx->temp_reg;
4245			alu.src[0].chan = 2;
4246			alu.src[0].abs = 1;
4247			alu.dst.sel = ctx->temp_reg;
4248			alu.dst.chan = 2;
4249			alu.dst.write = 1;
4250			alu.last = 1;
4251			r = r600_bytecode_add_alu(ctx->bc, &alu);
4252			if (r)
4253				return r;
4254		}
4255
4256		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
4257		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
4258		 * muladd has no writemask, have to use another temp
4259		 */
4260		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4261		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4262		alu.is_op3 = 1;
4263
4264		alu.src[0].sel = ctx->temp_reg;
4265		alu.src[0].chan = 0;
4266		alu.src[1].sel = ctx->temp_reg;
4267		alu.src[1].chan = 2;
4268
4269		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4270		alu.src[2].chan = 0;
4271		alu.src[2].value = *(uint32_t *)&one_point_five;
4272
4273		alu.dst.sel = ctx->temp_reg;
4274		alu.dst.chan = 0;
4275		alu.dst.write = 1;
4276
4277		r = r600_bytecode_add_alu(ctx->bc, &alu);
4278		if (r)
4279			return r;
4280
4281		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4282		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4283		alu.is_op3 = 1;
4284
4285		alu.src[0].sel = ctx->temp_reg;
4286		alu.src[0].chan = 1;
4287		alu.src[1].sel = ctx->temp_reg;
4288		alu.src[1].chan = 2;
4289
4290		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4291		alu.src[2].chan = 0;
4292		alu.src[2].value = *(uint32_t *)&one_point_five;
4293
4294		alu.dst.sel = ctx->temp_reg;
4295		alu.dst.chan = 1;
4296		alu.dst.write = 1;
4297
4298		alu.last = 1;
4299		r = r600_bytecode_add_alu(ctx->bc, &alu);
4300		if (r)
4301			return r;
4302		/* write initial compare value into Z component
4303		  - W src 0 for shadow cube
4304		  - X src 1 for shadow cube array */
4305		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4306		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4307			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4308			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4309			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4310				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
4311			else
4312				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4313			alu.dst.sel = ctx->temp_reg;
4314			alu.dst.chan = 2;
4315			alu.dst.write = 1;
4316			alu.last = 1;
4317			r = r600_bytecode_add_alu(ctx->bc, &alu);
4318			if (r)
4319				return r;
4320		}
4321
4322		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
4323		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4324			if (ctx->bc->chip_class >= EVERGREEN) {
4325				int mytmp = r600_get_temp(ctx);
4326				static const float eight = 8.0f;
4327				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4328				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4329				alu.src[0].sel = ctx->temp_reg;
4330				alu.src[0].chan = 3;
4331				alu.dst.sel = mytmp;
4332				alu.dst.chan = 0;
4333				alu.dst.write = 1;
4334				alu.last = 1;
4335				r = r600_bytecode_add_alu(ctx->bc, &alu);
4336				if (r)
4337					return r;
4338
4339				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
4340				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4341				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4342				alu.is_op3 = 1;
4343				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4344				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4345				alu.src[1].chan = 0;
4346				alu.src[1].value = *(uint32_t *)&eight;
4347				alu.src[2].sel = mytmp;
4348				alu.src[2].chan = 0;
4349				alu.dst.sel = ctx->temp_reg;
4350				alu.dst.chan = 3;
4351				alu.dst.write = 1;
4352				alu.last = 1;
4353				r = r600_bytecode_add_alu(ctx->bc, &alu);
4354				if (r)
4355					return r;
4356			} else if (ctx->bc->chip_class < EVERGREEN) {
4357				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4358				tex.inst = SQ_TEX_INST_SET_CUBEMAP_INDEX;
4359				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4360				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4361				tex.src_gpr = r600_get_temp(ctx);
4362				tex.src_sel_x = 0;
4363				tex.src_sel_y = 0;
4364				tex.src_sel_z = 0;
4365				tex.src_sel_w = 0;
4366				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
4367				tex.coord_type_x = 1;
4368				tex.coord_type_y = 1;
4369				tex.coord_type_z = 1;
4370				tex.coord_type_w = 1;
4371				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4372				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4373				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4374				alu.dst.sel = tex.src_gpr;
4375				alu.dst.chan = 0;
4376				alu.last = 1;
4377				alu.dst.write = 1;
4378				r = r600_bytecode_add_alu(ctx->bc, &alu);
4379				if (r)
4380					return r;
4381
4382				r = r600_bytecode_add_tex(ctx->bc, &tex);
4383				if (r)
4384					return r;
4385			}
4386
4387		}
4388
4389		/* for cube forms of lod and bias we need to route things */
4390		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
4391		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
4392		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
4393		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
4394			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4395			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4396			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
4397			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
4398				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
4399			else
4400				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4401			alu.dst.sel = ctx->temp_reg;
4402			alu.dst.chan = 2;
4403			alu.last = 1;
4404			alu.dst.write = 1;
4405			r = r600_bytecode_add_alu(ctx->bc, &alu);
4406			if (r)
4407				return r;
4408		}
4409
4410		src_loaded = TRUE;
4411		src_gpr = ctx->temp_reg;
4412	}
4413
4414	if (src_requires_loading && !src_loaded) {
4415		for (i = 0; i < 4; i++) {
4416			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4417			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4418			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4419			alu.dst.sel = ctx->temp_reg;
4420			alu.dst.chan = i;
4421			if (i == 3)
4422				alu.last = 1;
4423			alu.dst.write = 1;
4424			r = r600_bytecode_add_alu(ctx->bc, &alu);
4425			if (r)
4426				return r;
4427		}
4428		src_loaded = TRUE;
4429		src_gpr = ctx->temp_reg;
4430	}
4431
4432	/* Obtain the sample index for reading a compressed MSAA color texture.
4433	 * To read the FMASK, we use the ldfptr instruction, which tells us
4434	 * where the samples are stored.
4435	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
4436	 * which is the identity mapping. Each nibble says which physical sample
4437	 * should be fetched to get that sample.
4438	 *
4439	 * Assume src.z contains the sample index. It should be modified like this:
4440	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
4441	 * Then fetch the texel with src.
4442	 */
4443	if (read_compressed_msaa) {
4444		unsigned sample_chan = inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ? 3 : 4;
4445		unsigned temp = r600_get_temp(ctx);
4446		assert(src_loaded);
4447
4448		/* temp.w = ldfptr() */
4449		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4450		tex.inst = SQ_TEX_INST_LD;
4451		tex.inst_mod = 1; /* to indicate this is ldfptr */
4452		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4453		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4454		tex.src_gpr = src_gpr;
4455		tex.dst_gpr = temp;
4456		tex.dst_sel_x = 7; /* mask out these components */
4457		tex.dst_sel_y = 7;
4458		tex.dst_sel_z = 7;
4459		tex.dst_sel_w = 0; /* store X */
4460		tex.src_sel_x = 0;
4461		tex.src_sel_y = 1;
4462		tex.src_sel_z = 2;
4463		tex.src_sel_w = 3;
4464		tex.offset_x = offset_x;
4465		tex.offset_y = offset_y;
4466		tex.offset_z = offset_z;
4467		r = r600_bytecode_add_tex(ctx->bc, &tex);
4468		if (r)
4469			return r;
4470
4471		/* temp.x = sample_index*4 */
4472		if (ctx->bc->chip_class == CAYMAN) {
4473			for (i = 0 ; i < 4; i++) {
4474				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4475				alu.inst = ctx->inst_info->r600_opcode;
4476				alu.src[0].sel = src_gpr;
4477				alu.src[0].chan = sample_chan;
4478				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4479				alu.src[1].value = 4;
4480				alu.dst.sel = temp;
4481				alu.dst.chan = i;
4482				alu.dst.write = i == 0;
4483				if (i == 3)
4484					alu.last = 1;
4485				r = r600_bytecode_add_alu(ctx->bc, &alu);
4486				if (r)
4487					return r;
4488			}
4489		} else {
4490			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4491			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT);
4492			alu.src[0].sel = src_gpr;
4493			alu.src[0].chan = sample_chan;
4494			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4495			alu.src[1].value = 4;
4496			alu.dst.sel = temp;
4497			alu.dst.chan = 0;
4498			alu.dst.write = 1;
4499			alu.last = 1;
4500			r = r600_bytecode_add_alu(ctx->bc, &alu);
4501			if (r)
4502				return r;
4503		}
4504
4505		/* sample_index = temp.w >> temp.x */
4506		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4507		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT);
4508		alu.src[0].sel = temp;
4509		alu.src[0].chan = 3;
4510		alu.src[1].sel = temp;
4511		alu.src[1].chan = 0;
4512		alu.dst.sel = src_gpr;
4513		alu.dst.chan = sample_chan;
4514		alu.dst.write = 1;
4515		alu.last = 1;
4516		r = r600_bytecode_add_alu(ctx->bc, &alu);
4517		if (r)
4518			return r;
4519
4520		/* sample_index & 0xF */
4521		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4522		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
4523		alu.src[0].sel = src_gpr;
4524		alu.src[0].chan = sample_chan;
4525		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4526		alu.src[1].value = 0xF;
4527		alu.dst.sel = src_gpr;
4528		alu.dst.chan = sample_chan;
4529		alu.dst.write = 1;
4530		alu.last = 1;
4531		r = r600_bytecode_add_alu(ctx->bc, &alu);
4532		if (r)
4533			return r;
4534#if 0
4535		/* visualize the FMASK */
4536		for (i = 0; i < 4; i++) {
4537			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4538			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
4539			alu.src[0].sel = src_gpr;
4540			alu.src[0].chan = sample_chan;
4541			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4542			alu.dst.chan = i;
4543			alu.dst.write = 1;
4544			alu.last = 1;
4545			r = r600_bytecode_add_alu(ctx->bc, &alu);
4546			if (r)
4547				return r;
4548		}
4549		return 0;
4550#endif
4551	}
4552
4553	/* does this shader want a num layers from TXQ for a cube array? */
4554	if (has_txq_cube_array_z) {
4555		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4556
4557		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4558		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4559
4560		alu.src[0].sel = 512 + (id / 4);
4561		alu.src[0].kc_bank = R600_TXQ_CONST_BUFFER;
4562		alu.src[0].chan = id % 4;
4563		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
4564		alu.last = 1;
4565		r = r600_bytecode_add_alu(ctx->bc, &alu);
4566		if (r)
4567			return r;
4568		/* disable writemask from texture instruction */
4569		inst->Dst[0].Register.WriteMask &= ~4;
4570	}
4571
4572	opcode = ctx->inst_info->r600_opcode;
4573	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4574	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4575	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4576	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4577	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
4578	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
4579	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4580		switch (opcode) {
4581		case SQ_TEX_INST_SAMPLE:
4582			opcode = SQ_TEX_INST_SAMPLE_C;
4583			break;
4584		case SQ_TEX_INST_SAMPLE_L:
4585			opcode = SQ_TEX_INST_SAMPLE_C_L;
4586			break;
4587		case SQ_TEX_INST_SAMPLE_LB:
4588			opcode = SQ_TEX_INST_SAMPLE_C_LB;
4589			break;
4590		case SQ_TEX_INST_SAMPLE_G:
4591			opcode = SQ_TEX_INST_SAMPLE_C_G;
4592			break;
4593		}
4594	}
4595
4596	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4597	tex.inst = opcode;
4598
4599	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4600	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4601	tex.src_gpr = src_gpr;
4602	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4603	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
4604	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
4605	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
4606	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
4607
4608	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
4609		tex.src_sel_x = 4;
4610		tex.src_sel_y = 4;
4611		tex.src_sel_z = 4;
4612		tex.src_sel_w = 4;
4613	} else if (src_loaded) {
4614		tex.src_sel_x = 0;
4615		tex.src_sel_y = 1;
4616		tex.src_sel_z = 2;
4617		tex.src_sel_w = 3;
4618	} else {
4619		tex.src_sel_x = ctx->src[0].swizzle[0];
4620		tex.src_sel_y = ctx->src[0].swizzle[1];
4621		tex.src_sel_z = ctx->src[0].swizzle[2];
4622		tex.src_sel_w = ctx->src[0].swizzle[3];
4623		tex.src_rel = ctx->src[0].rel;
4624	}
4625
4626	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
4627	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4628	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
4629	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4630		tex.src_sel_x = 1;
4631		tex.src_sel_y = 0;
4632		tex.src_sel_z = 3;
4633		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
4634	}
4635
4636	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
4637	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
4638		tex.coord_type_x = 1;
4639		tex.coord_type_y = 1;
4640	}
4641	tex.coord_type_z = 1;
4642	tex.coord_type_w = 1;
4643
4644	tex.offset_x = offset_x;
4645	tex.offset_y = offset_y;
4646	tex.offset_z = offset_z;
4647
4648	/* Put the depth for comparison in W.
4649	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
4650	 * Some instructions expect the depth in Z. */
4651	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4652	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4653	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4654	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
4655	    opcode != SQ_TEX_INST_SAMPLE_C_L &&
4656	    opcode != SQ_TEX_INST_SAMPLE_C_LB) {
4657		tex.src_sel_w = tex.src_sel_z;
4658	}
4659
4660	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
4661	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4662		if (opcode == SQ_TEX_INST_SAMPLE_C_L ||
4663		    opcode == SQ_TEX_INST_SAMPLE_C_LB) {
4664			/* the array index is read from Y */
4665			tex.coord_type_y = 0;
4666		} else {
4667			/* the array index is read from Z */
4668			tex.coord_type_z = 0;
4669			tex.src_sel_z = tex.src_sel_y;
4670		}
4671	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
4672		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
4673		   ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
4674		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
4675		    (ctx->bc->chip_class >= EVERGREEN)))
4676		/* the array index is read from Z */
4677		tex.coord_type_z = 0;
4678
4679	r = r600_bytecode_add_tex(ctx->bc, &tex);
4680	if (r)
4681		return r;
4682
4683	/* add shadow ambient support  - gallium doesn't do it yet */
4684	return 0;
4685}
4686
4687static int tgsi_lrp(struct r600_shader_ctx *ctx)
4688{
4689	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4690	struct r600_bytecode_alu alu;
4691	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4692	unsigned i;
4693	int r;
4694
4695	/* optimize if it's just an equal balance */
4696	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
4697		for (i = 0; i < lasti + 1; i++) {
4698			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4699				continue;
4700
4701			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4702			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4703			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4704			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4705			alu.omod = 3;
4706			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4707			alu.dst.chan = i;
4708			if (i == lasti) {
4709				alu.last = 1;
4710			}
4711			r = r600_bytecode_add_alu(ctx->bc, &alu);
4712			if (r)
4713				return r;
4714		}
4715		return 0;
4716	}
4717
4718	/* 1 - src0 */
4719	for (i = 0; i < lasti + 1; i++) {
4720		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4721			continue;
4722
4723		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4724		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4725		alu.src[0].sel = V_SQ_ALU_SRC_1;
4726		alu.src[0].chan = 0;
4727		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4728		r600_bytecode_src_toggle_neg(&alu.src[1]);
4729		alu.dst.sel = ctx->temp_reg;
4730		alu.dst.chan = i;
4731		if (i == lasti) {
4732			alu.last = 1;
4733		}
4734		alu.dst.write = 1;
4735		r = r600_bytecode_add_alu(ctx->bc, &alu);
4736		if (r)
4737			return r;
4738	}
4739
4740	/* (1 - src0) * src2 */
4741	for (i = 0; i < lasti + 1; i++) {
4742		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4743			continue;
4744
4745		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4746		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4747		alu.src[0].sel = ctx->temp_reg;
4748		alu.src[0].chan = i;
4749		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4750		alu.dst.sel = ctx->temp_reg;
4751		alu.dst.chan = i;
4752		if (i == lasti) {
4753			alu.last = 1;
4754		}
4755		alu.dst.write = 1;
4756		r = r600_bytecode_add_alu(ctx->bc, &alu);
4757		if (r)
4758			return r;
4759	}
4760
4761	/* src0 * src1 + (1 - src0) * src2 */
4762	for (i = 0; i < lasti + 1; i++) {
4763		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4764			continue;
4765
4766		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4767		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4768		alu.is_op3 = 1;
4769		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4770		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4771		alu.src[2].sel = ctx->temp_reg;
4772		alu.src[2].chan = i;
4773
4774		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4775		alu.dst.chan = i;
4776		if (i == lasti) {
4777			alu.last = 1;
4778		}
4779		r = r600_bytecode_add_alu(ctx->bc, &alu);
4780		if (r)
4781			return r;
4782	}
4783	return 0;
4784}
4785
4786static int tgsi_cmp(struct r600_shader_ctx *ctx)
4787{
4788	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4789	struct r600_bytecode_alu alu;
4790	int i, r;
4791	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4792
4793	for (i = 0; i < lasti + 1; i++) {
4794		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4795			continue;
4796
4797		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4798		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE);
4799		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4800		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4801		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
4802		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4803		alu.dst.chan = i;
4804		alu.dst.write = 1;
4805		alu.is_op3 = 1;
4806		if (i == lasti)
4807			alu.last = 1;
4808		r = r600_bytecode_add_alu(ctx->bc, &alu);
4809		if (r)
4810			return r;
4811	}
4812	return 0;
4813}
4814
4815static int tgsi_ucmp(struct r600_shader_ctx *ctx)
4816{
4817	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4818	struct r600_bytecode_alu alu;
4819	int i, r;
4820	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4821
4822	for (i = 0; i < lasti + 1; i++) {
4823		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4824			continue;
4825
4826		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4827		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
4828		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4829		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4830		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
4831		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4832		alu.dst.chan = i;
4833		alu.dst.write = 1;
4834		alu.is_op3 = 1;
4835		if (i == lasti)
4836			alu.last = 1;
4837		r = r600_bytecode_add_alu(ctx->bc, &alu);
4838		if (r)
4839			return r;
4840	}
4841	return 0;
4842}
4843
4844static int tgsi_xpd(struct r600_shader_ctx *ctx)
4845{
4846	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4847	static const unsigned int src0_swizzle[] = {2, 0, 1};
4848	static const unsigned int src1_swizzle[] = {1, 2, 0};
4849	struct r600_bytecode_alu alu;
4850	uint32_t use_temp = 0;
4851	int i, r;
4852
4853	if (inst->Dst[0].Register.WriteMask != 0xf)
4854		use_temp = 1;
4855
4856	for (i = 0; i < 4; i++) {
4857		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4858		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4859		if (i < 3) {
4860			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
4861			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
4862		} else {
4863			alu.src[0].sel = V_SQ_ALU_SRC_0;
4864			alu.src[0].chan = i;
4865			alu.src[1].sel = V_SQ_ALU_SRC_0;
4866			alu.src[1].chan = i;
4867		}
4868
4869		alu.dst.sel = ctx->temp_reg;
4870		alu.dst.chan = i;
4871		alu.dst.write = 1;
4872
4873		if (i == 3)
4874			alu.last = 1;
4875		r = r600_bytecode_add_alu(ctx->bc, &alu);
4876		if (r)
4877			return r;
4878	}
4879
4880	for (i = 0; i < 4; i++) {
4881		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4882		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4883
4884		if (i < 3) {
4885			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
4886			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
4887		} else {
4888			alu.src[0].sel = V_SQ_ALU_SRC_0;
4889			alu.src[0].chan = i;
4890			alu.src[1].sel = V_SQ_ALU_SRC_0;
4891			alu.src[1].chan = i;
4892		}
4893
4894		alu.src[2].sel = ctx->temp_reg;
4895		alu.src[2].neg = 1;
4896		alu.src[2].chan = i;
4897
4898		if (use_temp)
4899			alu.dst.sel = ctx->temp_reg;
4900		else
4901			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4902		alu.dst.chan = i;
4903		alu.dst.write = 1;
4904		alu.is_op3 = 1;
4905		if (i == 3)
4906			alu.last = 1;
4907		r = r600_bytecode_add_alu(ctx->bc, &alu);
4908		if (r)
4909			return r;
4910	}
4911	if (use_temp)
4912		return tgsi_helper_copy(ctx, inst);
4913	return 0;
4914}
4915
4916static int tgsi_exp(struct r600_shader_ctx *ctx)
4917{
4918	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4919	struct r600_bytecode_alu alu;
4920	int r;
4921	int i;
4922
4923	/* result.x = 2^floor(src); */
4924	if (inst->Dst[0].Register.WriteMask & 1) {
4925		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4926
4927		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4928		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4929
4930		alu.dst.sel = ctx->temp_reg;
4931		alu.dst.chan = 0;
4932		alu.dst.write = 1;
4933		alu.last = 1;
4934		r = r600_bytecode_add_alu(ctx->bc, &alu);
4935		if (r)
4936			return r;
4937
4938		if (ctx->bc->chip_class == CAYMAN) {
4939			for (i = 0; i < 3; i++) {
4940				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4941				alu.src[0].sel = ctx->temp_reg;
4942				alu.src[0].chan = 0;
4943
4944				alu.dst.sel = ctx->temp_reg;
4945				alu.dst.chan = i;
4946				alu.dst.write = i == 0;
4947				alu.last = i == 2;
4948				r = r600_bytecode_add_alu(ctx->bc, &alu);
4949				if (r)
4950					return r;
4951			}
4952		} else {
4953			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4954			alu.src[0].sel = ctx->temp_reg;
4955			alu.src[0].chan = 0;
4956
4957			alu.dst.sel = ctx->temp_reg;
4958			alu.dst.chan = 0;
4959			alu.dst.write = 1;
4960			alu.last = 1;
4961			r = r600_bytecode_add_alu(ctx->bc, &alu);
4962			if (r)
4963				return r;
4964		}
4965	}
4966
4967	/* result.y = tmp - floor(tmp); */
4968	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4969		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4970
4971		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
4972		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4973
4974		alu.dst.sel = ctx->temp_reg;
4975#if 0
4976		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4977		if (r)
4978			return r;
4979#endif
4980		alu.dst.write = 1;
4981		alu.dst.chan = 1;
4982
4983		alu.last = 1;
4984
4985		r = r600_bytecode_add_alu(ctx->bc, &alu);
4986		if (r)
4987			return r;
4988	}
4989
4990	/* result.z = RoughApprox2ToX(tmp);*/
4991	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
4992		if (ctx->bc->chip_class == CAYMAN) {
4993			for (i = 0; i < 3; i++) {
4994				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4995				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4996				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4997
4998				alu.dst.sel = ctx->temp_reg;
4999				alu.dst.chan = i;
5000				if (i == 2) {
5001					alu.dst.write = 1;
5002					alu.last = 1;
5003				}
5004
5005				r = r600_bytecode_add_alu(ctx->bc, &alu);
5006				if (r)
5007					return r;
5008			}
5009		} else {
5010			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5011			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
5012			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5013
5014			alu.dst.sel = ctx->temp_reg;
5015			alu.dst.write = 1;
5016			alu.dst.chan = 2;
5017
5018			alu.last = 1;
5019
5020			r = r600_bytecode_add_alu(ctx->bc, &alu);
5021			if (r)
5022				return r;
5023		}
5024	}
5025
5026	/* result.w = 1.0;*/
5027	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
5028		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5029
5030		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
5031		alu.src[0].sel = V_SQ_ALU_SRC_1;
5032		alu.src[0].chan = 0;
5033
5034		alu.dst.sel = ctx->temp_reg;
5035		alu.dst.chan = 3;
5036		alu.dst.write = 1;
5037		alu.last = 1;
5038		r = r600_bytecode_add_alu(ctx->bc, &alu);
5039		if (r)
5040			return r;
5041	}
5042	return tgsi_helper_copy(ctx, inst);
5043}
5044
5045static int tgsi_log(struct r600_shader_ctx *ctx)
5046{
5047	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5048	struct r600_bytecode_alu alu;
5049	int r;
5050	int i;
5051
5052	/* result.x = floor(log2(|src|)); */
5053	if (inst->Dst[0].Register.WriteMask & 1) {
5054		if (ctx->bc->chip_class == CAYMAN) {
5055			for (i = 0; i < 3; i++) {
5056				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5057
5058				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
5059				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5060				r600_bytecode_src_set_abs(&alu.src[0]);
5061
5062				alu.dst.sel = ctx->temp_reg;
5063				alu.dst.chan = i;
5064				if (i == 0)
5065					alu.dst.write = 1;
5066				if (i == 2)
5067					alu.last = 1;
5068				r = r600_bytecode_add_alu(ctx->bc, &alu);
5069				if (r)
5070					return r;
5071			}
5072
5073		} else {
5074			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5075
5076			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
5077			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5078			r600_bytecode_src_set_abs(&alu.src[0]);
5079
5080			alu.dst.sel = ctx->temp_reg;
5081			alu.dst.chan = 0;
5082			alu.dst.write = 1;
5083			alu.last = 1;
5084			r = r600_bytecode_add_alu(ctx->bc, &alu);
5085			if (r)
5086				return r;
5087		}
5088
5089		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
5090		alu.src[0].sel = ctx->temp_reg;
5091		alu.src[0].chan = 0;
5092
5093		alu.dst.sel = ctx->temp_reg;
5094		alu.dst.chan = 0;
5095		alu.dst.write = 1;
5096		alu.last = 1;
5097
5098		r = r600_bytecode_add_alu(ctx->bc, &alu);
5099		if (r)
5100			return r;
5101	}
5102
5103	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
5104	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
5105
5106		if (ctx->bc->chip_class == CAYMAN) {
5107			for (i = 0; i < 3; i++) {
5108				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5109
5110				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
5111				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5112				r600_bytecode_src_set_abs(&alu.src[0]);
5113
5114				alu.dst.sel = ctx->temp_reg;
5115				alu.dst.chan = i;
5116				if (i == 1)
5117					alu.dst.write = 1;
5118				if (i == 2)
5119					alu.last = 1;
5120
5121				r = r600_bytecode_add_alu(ctx->bc, &alu);
5122				if (r)
5123					return r;
5124			}
5125		} else {
5126			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5127
5128			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
5129			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5130			r600_bytecode_src_set_abs(&alu.src[0]);
5131
5132			alu.dst.sel = ctx->temp_reg;
5133			alu.dst.chan = 1;
5134			alu.dst.write = 1;
5135			alu.last = 1;
5136
5137			r = r600_bytecode_add_alu(ctx->bc, &alu);
5138			if (r)
5139				return r;
5140		}
5141
5142		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5143
5144		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
5145		alu.src[0].sel = ctx->temp_reg;
5146		alu.src[0].chan = 1;
5147
5148		alu.dst.sel = ctx->temp_reg;
5149		alu.dst.chan = 1;
5150		alu.dst.write = 1;
5151		alu.last = 1;
5152
5153		r = r600_bytecode_add_alu(ctx->bc, &alu);
5154		if (r)
5155			return r;
5156
5157		if (ctx->bc->chip_class == CAYMAN) {
5158			for (i = 0; i < 3; i++) {
5159				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5160				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
5161				alu.src[0].sel = ctx->temp_reg;
5162				alu.src[0].chan = 1;
5163
5164				alu.dst.sel = ctx->temp_reg;
5165				alu.dst.chan = i;
5166				if (i == 1)
5167					alu.dst.write = 1;
5168				if (i == 2)
5169					alu.last = 1;
5170
5171				r = r600_bytecode_add_alu(ctx->bc, &alu);
5172				if (r)
5173					return r;
5174			}
5175		} else {
5176			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5177			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
5178			alu.src[0].sel = ctx->temp_reg;
5179			alu.src[0].chan = 1;
5180
5181			alu.dst.sel = ctx->temp_reg;
5182			alu.dst.chan = 1;
5183			alu.dst.write = 1;
5184			alu.last = 1;
5185
5186			r = r600_bytecode_add_alu(ctx->bc, &alu);
5187			if (r)
5188				return r;
5189		}
5190
5191		if (ctx->bc->chip_class == CAYMAN) {
5192			for (i = 0; i < 3; i++) {
5193				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5194				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
5195				alu.src[0].sel = ctx->temp_reg;
5196				alu.src[0].chan = 1;
5197
5198				alu.dst.sel = ctx->temp_reg;
5199				alu.dst.chan = i;
5200				if (i == 1)
5201					alu.dst.write = 1;
5202				if (i == 2)
5203					alu.last = 1;
5204
5205				r = r600_bytecode_add_alu(ctx->bc, &alu);
5206				if (r)
5207					return r;
5208			}
5209		} else {
5210			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5211			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
5212			alu.src[0].sel = ctx->temp_reg;
5213			alu.src[0].chan = 1;
5214
5215			alu.dst.sel = ctx->temp_reg;
5216			alu.dst.chan = 1;
5217			alu.dst.write = 1;
5218			alu.last = 1;
5219
5220			r = r600_bytecode_add_alu(ctx->bc, &alu);
5221			if (r)
5222				return r;
5223		}
5224
5225		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5226
5227		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
5228
5229		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5230		r600_bytecode_src_set_abs(&alu.src[0]);
5231
5232		alu.src[1].sel = ctx->temp_reg;
5233		alu.src[1].chan = 1;
5234
5235		alu.dst.sel = ctx->temp_reg;
5236		alu.dst.chan = 1;
5237		alu.dst.write = 1;
5238		alu.last = 1;
5239
5240		r = r600_bytecode_add_alu(ctx->bc, &alu);
5241		if (r)
5242			return r;
5243	}
5244
5245	/* result.z = log2(|src|);*/
5246	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
5247		if (ctx->bc->chip_class == CAYMAN) {
5248			for (i = 0; i < 3; i++) {
5249				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5250
5251				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
5252				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5253				r600_bytecode_src_set_abs(&alu.src[0]);
5254
5255				alu.dst.sel = ctx->temp_reg;
5256				if (i == 2)
5257					alu.dst.write = 1;
5258				alu.dst.chan = i;
5259				if (i == 2)
5260					alu.last = 1;
5261
5262				r = r600_bytecode_add_alu(ctx->bc, &alu);
5263				if (r)
5264					return r;
5265			}
5266		} else {
5267			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5268
5269			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
5270			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5271			r600_bytecode_src_set_abs(&alu.src[0]);
5272
5273			alu.dst.sel = ctx->temp_reg;
5274			alu.dst.write = 1;
5275			alu.dst.chan = 2;
5276			alu.last = 1;
5277
5278			r = r600_bytecode_add_alu(ctx->bc, &alu);
5279			if (r)
5280				return r;
5281		}
5282	}
5283
5284	/* result.w = 1.0; */
5285	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
5286		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5287
5288		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
5289		alu.src[0].sel = V_SQ_ALU_SRC_1;
5290		alu.src[0].chan = 0;
5291
5292		alu.dst.sel = ctx->temp_reg;
5293		alu.dst.chan = 3;
5294		alu.dst.write = 1;
5295		alu.last = 1;
5296
5297		r = r600_bytecode_add_alu(ctx->bc, &alu);
5298		if (r)
5299			return r;
5300	}
5301
5302	return tgsi_helper_copy(ctx, inst);
5303}
5304
5305static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
5306{
5307	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5308	struct r600_bytecode_alu alu;
5309	int r;
5310
5311	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5312
5313	switch (inst->Instruction.Opcode) {
5314	case TGSI_OPCODE_ARL:
5315		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR;
5316		break;
5317	case TGSI_OPCODE_ARR:
5318		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
5319		break;
5320	case TGSI_OPCODE_UARL:
5321		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
5322		break;
5323	default:
5324		assert(0);
5325		return -1;
5326	}
5327
5328	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5329	alu.last = 1;
5330	alu.dst.sel = ctx->bc->ar_reg;
5331	alu.dst.write = 1;
5332	r = r600_bytecode_add_alu(ctx->bc, &alu);
5333	if (r)
5334		return r;
5335
5336	ctx->bc->ar_loaded = 0;
5337	return 0;
5338}
5339static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
5340{
5341	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5342	struct r600_bytecode_alu alu;
5343	int r;
5344
5345	switch (inst->Instruction.Opcode) {
5346	case TGSI_OPCODE_ARL:
5347		memset(&alu, 0, sizeof(alu));
5348		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR;
5349		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5350		alu.dst.sel = ctx->bc->ar_reg;
5351		alu.dst.write = 1;
5352		alu.last = 1;
5353
5354		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5355			return r;
5356
5357		memset(&alu, 0, sizeof(alu));
5358		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
5359		alu.src[0].sel = ctx->bc->ar_reg;
5360		alu.dst.sel = ctx->bc->ar_reg;
5361		alu.dst.write = 1;
5362		alu.last = 1;
5363
5364		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5365			return r;
5366		break;
5367	case TGSI_OPCODE_ARR:
5368		memset(&alu, 0, sizeof(alu));
5369		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
5370		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5371		alu.dst.sel = ctx->bc->ar_reg;
5372		alu.dst.write = 1;
5373		alu.last = 1;
5374
5375		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5376			return r;
5377		break;
5378	case TGSI_OPCODE_UARL:
5379		memset(&alu, 0, sizeof(alu));
5380		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
5381		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5382		alu.dst.sel = ctx->bc->ar_reg;
5383		alu.dst.write = 1;
5384		alu.last = 1;
5385
5386		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5387			return r;
5388		break;
5389	default:
5390		assert(0);
5391		return -1;
5392	}
5393
5394	ctx->bc->ar_loaded = 0;
5395	return 0;
5396}
5397
5398static int tgsi_opdst(struct r600_shader_ctx *ctx)
5399{
5400	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5401	struct r600_bytecode_alu alu;
5402	int i, r = 0;
5403
5404	for (i = 0; i < 4; i++) {
5405		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5406
5407		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
5408		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5409
5410		if (i == 0 || i == 3) {
5411			alu.src[0].sel = V_SQ_ALU_SRC_1;
5412		} else {
5413			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5414		}
5415
5416		if (i == 0 || i == 2) {
5417			alu.src[1].sel = V_SQ_ALU_SRC_1;
5418		} else {
5419			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5420		}
5421		if (i == 3)
5422			alu.last = 1;
5423		r = r600_bytecode_add_alu(ctx->bc, &alu);
5424		if (r)
5425			return r;
5426	}
5427	return 0;
5428}
5429
5430static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
5431{
5432	struct r600_bytecode_alu alu;
5433	int r;
5434
5435	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5436	alu.inst = opcode;
5437	alu.execute_mask = 1;
5438	alu.update_pred = 1;
5439
5440	alu.dst.sel = ctx->temp_reg;
5441	alu.dst.write = 1;
5442	alu.dst.chan = 0;
5443
5444	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5445	alu.src[1].sel = V_SQ_ALU_SRC_0;
5446	alu.src[1].chan = 0;
5447
5448	alu.last = 1;
5449
5450	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
5451	if (r)
5452		return r;
5453	return 0;
5454}
5455
5456static int pops(struct r600_shader_ctx *ctx, int pops)
5457{
5458	unsigned force_pop = ctx->bc->force_add_cf;
5459
5460	if (!force_pop) {
5461		int alu_pop = 3;
5462		if (ctx->bc->cf_last) {
5463			if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU))
5464				alu_pop = 0;
5465			else if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER))
5466				alu_pop = 1;
5467		}
5468		alu_pop += pops;
5469		if (alu_pop == 1) {
5470			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER);
5471			ctx->bc->force_add_cf = 1;
5472		} else if (alu_pop == 2) {
5473			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER);
5474			ctx->bc->force_add_cf = 1;
5475		} else {
5476			force_pop = 1;
5477		}
5478	}
5479
5480	if (force_pop) {
5481		r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_POP));
5482		ctx->bc->cf_last->pop_count = pops;
5483		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
5484	}
5485
5486	return 0;
5487}
5488
5489static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
5490{
5491	switch(reason) {
5492	case FC_PUSH_VPM:
5493		ctx->bc->callstack[ctx->bc->call_sp].current--;
5494		break;
5495	case FC_PUSH_WQM:
5496	case FC_LOOP:
5497		ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
5498		break;
5499	case FC_REP:
5500		/* TOODO : for 16 vp asic should -= 2; */
5501		ctx->bc->callstack[ctx->bc->call_sp].current --;
5502		break;
5503	}
5504}
5505
5506static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
5507{
5508	if (check_max_only) {
5509		int diff;
5510		switch (reason) {
5511		case FC_PUSH_VPM:
5512			diff = 1;
5513			break;
5514		case FC_PUSH_WQM:
5515			diff = 4;
5516			break;
5517		default:
5518			assert(0);
5519			diff = 0;
5520		}
5521		if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
5522		    ctx->bc->callstack[ctx->bc->call_sp].max) {
5523			ctx->bc->callstack[ctx->bc->call_sp].max =
5524				ctx->bc->callstack[ctx->bc->call_sp].current + diff;
5525		}
5526		return;
5527	}
5528	switch (reason) {
5529	case FC_PUSH_VPM:
5530		ctx->bc->callstack[ctx->bc->call_sp].current++;
5531		break;
5532	case FC_PUSH_WQM:
5533	case FC_LOOP:
5534		ctx->bc->callstack[ctx->bc->call_sp].current += 4;
5535		break;
5536	case FC_REP:
5537		ctx->bc->callstack[ctx->bc->call_sp].current++;
5538		break;
5539	}
5540
5541	if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
5542	    ctx->bc->callstack[ctx->bc->call_sp].max) {
5543		ctx->bc->callstack[ctx->bc->call_sp].max =
5544			ctx->bc->callstack[ctx->bc->call_sp].current;
5545	}
5546}
5547
5548static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
5549{
5550	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
5551
5552	sp->mid = realloc((void *)sp->mid,
5553						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
5554	sp->mid[sp->num_mid] = ctx->bc->cf_last;
5555	sp->num_mid++;
5556}
5557
5558static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
5559{
5560	ctx->bc->fc_sp++;
5561	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
5562	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
5563}
5564
5565static void fc_poplevel(struct r600_shader_ctx *ctx)
5566{
5567	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
5568	free(sp->mid);
5569	sp->mid = NULL;
5570	sp->num_mid = 0;
5571	sp->start = NULL;
5572	sp->type = 0;
5573	ctx->bc->fc_sp--;
5574}
5575
/*
 * Everything up to the matching #endif is compiled out.  It is unfinished
 * scaffolding for RETURN / jump / break-on-flag handling:
 * emit_setret_in_loop_flag() and emit_testflag() are empty stubs and
 * emit_jump_to_offset() never uses its offset argument.
 */
#if 0
/* Append a RETURN control-flow instruction. Always returns 0. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
	return 0;
}

/* Append a JUMP with the given pop count; the jump target is not set yet. */
static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

/* Stub: emits nothing and returns 0. */
static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

/* Stub: emits nothing. */
static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

/* Sequence: test flag, JUMP (pop 1), set flag, pop ifidx + 1 levels, RETURN. */
static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

/*
 * Test flag, append the current instruction's CF opcode with pop_count 1,
 * record it in the mid list of level fc_sp, then pop one level.
 */
static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
5623
5624static int tgsi_if(struct r600_shader_ctx *ctx)
5625{
5626	emit_logic_pred(ctx, CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
5627
5628	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
5629
5630	fc_pushlevel(ctx, FC_IF);
5631
5632	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
5633	return 0;
5634}
5635
5636static int tgsi_else(struct r600_shader_ctx *ctx)
5637{
5638	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_ELSE));
5639	ctx->bc->cf_last->pop_count = 1;
5640
5641	fc_set_mid(ctx, ctx->bc->fc_sp);
5642	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
5643	return 0;
5644}
5645
5646static int tgsi_endif(struct r600_shader_ctx *ctx)
5647{
5648	pops(ctx, 1);
5649	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
5650		R600_ERR("if/endif unbalanced in shader\n");
5651		return -1;
5652	}
5653
5654	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
5655		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5656		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
5657	} else {
5658		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
5659	}
5660	fc_poplevel(ctx);
5661
5662	callstack_decrease_current(ctx, FC_PUSH_VPM);
5663	return 0;
5664}
5665
5666static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
5667{
5668	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
5669	 * limited to 4096 iterations, like the other LOOP_* instructions. */
5670	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10));
5671
5672	fc_pushlevel(ctx, FC_LOOP);
5673
5674	/* check stack depth */
5675	callstack_check_depth(ctx, FC_LOOP, 0);
5676	return 0;
5677}
5678
5679static int tgsi_endloop(struct r600_shader_ctx *ctx)
5680{
5681	int i;
5682
5683	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END));
5684
5685	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
5686		R600_ERR("loop/endloop in shader code are not paired.\n");
5687		return -EINVAL;
5688	}
5689
5690	/* fixup loop pointers - from r600isa
5691	   LOOP END points to CF after LOOP START,
5692	   LOOP START point to CF after LOOP END
5693	   BRK/CONT point to LOOP END CF
5694	*/
5695	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
5696
5697	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5698
5699	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
5700		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
5701	}
5702	/* XXX add LOOPRET support */
5703	fc_poplevel(ctx);
5704	callstack_decrease_current(ctx, FC_LOOP);
5705	return 0;
5706}
5707
5708static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
5709{
5710	unsigned int fscp;
5711
5712	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
5713	{
5714		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
5715			break;
5716	}
5717
5718	if (fscp == 0) {
5719		R600_ERR("Break not inside loop/endloop pair\n");
5720		return -EINVAL;
5721	}
5722
5723	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
5724
5725	fc_set_mid(ctx, fscp);
5726
5727	callstack_check_depth(ctx, FC_PUSH_VPM, 1);
5728	return 0;
5729}
5730
5731static int tgsi_umad(struct r600_shader_ctx *ctx)
5732{
5733	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5734	struct r600_bytecode_alu alu;
5735	int i, j, r;
5736	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5737
5738	/* src0 * src1 */
5739	for (i = 0; i < lasti + 1; i++) {
5740		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5741			continue;
5742
5743		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5744
5745		alu.dst.chan = i;
5746		alu.dst.sel = ctx->temp_reg;
5747		alu.dst.write = 1;
5748
5749		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
5750		for (j = 0; j < 2; j++) {
5751		        r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5752		}
5753
5754		alu.last = 1;
5755		r = r600_bytecode_add_alu(ctx->bc, &alu);
5756		if (r)
5757			return r;
5758	}
5759
5760
5761	for (i = 0; i < lasti + 1; i++) {
5762		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5763			continue;
5764
5765		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5766		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5767
5768		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
5769
5770		alu.src[0].sel = ctx->temp_reg;
5771		alu.src[0].chan = i;
5772
5773		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5774		if (i == lasti) {
5775			alu.last = 1;
5776		}
5777		r = r600_bytecode_add_alu(ctx->bc, &alu);
5778		if (r)
5779			return r;
5780	}
5781	return 0;
5782}
5783
5784static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
5785	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5786	{TGSI_OPCODE_MOV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5787	{TGSI_OPCODE_LIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5788
5789	/* XXX:
5790	 * For state trackers other than OpenGL, we'll want to use
5791	 * _RECIP_IEEE instead.
5792	 */
5793	{TGSI_OPCODE_RCP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
5794
5795	{TGSI_OPCODE_RSQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_rsq},
5796	{TGSI_OPCODE_EXP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5797	{TGSI_OPCODE_LOG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5798	{TGSI_OPCODE_MUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5799	{TGSI_OPCODE_ADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5800	{TGSI_OPCODE_DP3,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5801	{TGSI_OPCODE_DP4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5802	{TGSI_OPCODE_DST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5803	{TGSI_OPCODE_MIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5804	{TGSI_OPCODE_MAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5805	{TGSI_OPCODE_SLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5806	{TGSI_OPCODE_SGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5807	{TGSI_OPCODE_MAD,	1, V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5808	{TGSI_OPCODE_SUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5809	{TGSI_OPCODE_LRP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5810	{TGSI_OPCODE_CND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5811	/* gap */
5812	{20,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5813	{TGSI_OPCODE_DP2A,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5814	/* gap */
5815	{22,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5816	{23,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5817	{TGSI_OPCODE_FRC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5818	{TGSI_OPCODE_CLAMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5819	{TGSI_OPCODE_FLR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5820	{TGSI_OPCODE_ROUND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5821	{TGSI_OPCODE_EX2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5822	{TGSI_OPCODE_LG2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5823	{TGSI_OPCODE_POW,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5824	{TGSI_OPCODE_XPD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5825	/* gap */
5826	{32,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5827	{TGSI_OPCODE_ABS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5828	{TGSI_OPCODE_RCC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5829	{TGSI_OPCODE_DPH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5830	{TGSI_OPCODE_COS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5831	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5832	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5833	{TGSI_OPCODE_KILP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5834	{TGSI_OPCODE_PK2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5835	{TGSI_OPCODE_PK2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5836	{TGSI_OPCODE_PK4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5837	{TGSI_OPCODE_PK4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5838	{TGSI_OPCODE_RFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5839	{TGSI_OPCODE_SEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5840	{TGSI_OPCODE_SFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5841	{TGSI_OPCODE_SGT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5842	{TGSI_OPCODE_SIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5843	{TGSI_OPCODE_SLE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5844	{TGSI_OPCODE_SNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5845	{TGSI_OPCODE_STR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5846	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5847	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5848	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5849	{TGSI_OPCODE_UP2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5850	{TGSI_OPCODE_UP2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5851	{TGSI_OPCODE_UP4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5852	{TGSI_OPCODE_UP4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5853	{TGSI_OPCODE_X2D,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5854	{TGSI_OPCODE_ARA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5855	{TGSI_OPCODE_ARR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5856	{TGSI_OPCODE_BRA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5857	{TGSI_OPCODE_CAL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5858	{TGSI_OPCODE_RET,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5859	{TGSI_OPCODE_SSG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5860	{TGSI_OPCODE_CMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5861	{TGSI_OPCODE_SCS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5862	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5863	{TGSI_OPCODE_NRM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5864	{TGSI_OPCODE_DIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5865	{TGSI_OPCODE_DP2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5866	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5867	{TGSI_OPCODE_BRK,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5868	{TGSI_OPCODE_IF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5869	/* gap */
5870	{75,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5871	{76,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5872	{TGSI_OPCODE_ELSE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5873	{TGSI_OPCODE_ENDIF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5874	/* gap */
5875	{79,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5876	{80,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5877	{TGSI_OPCODE_PUSHA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5878	{TGSI_OPCODE_POPA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5879	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5880	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5881	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5882	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5883	{TGSI_OPCODE_SHL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2_trans},
5884	/* gap */
5885	{88,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5886	{TGSI_OPCODE_AND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5887	{TGSI_OPCODE_OR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5888	{TGSI_OPCODE_MOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5889	{TGSI_OPCODE_XOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5890	{TGSI_OPCODE_SAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5891	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5892	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5893	{TGSI_OPCODE_CONT,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5894	{TGSI_OPCODE_EMIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5895	{TGSI_OPCODE_ENDPRIM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5896	{TGSI_OPCODE_BGNLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5897	{TGSI_OPCODE_BGNSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5898	{TGSI_OPCODE_ENDLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5899	{TGSI_OPCODE_ENDSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5900	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5901	/* gap */
5902	{104,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5903	{105,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5904	{106,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5905	{TGSI_OPCODE_NOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5906	/* gap */
5907	{108,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5908	{109,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5909	{110,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5910	{111,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5911	{TGSI_OPCODE_NRM4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5912	{TGSI_OPCODE_CALLNZ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5913	{TGSI_OPCODE_IFC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5914	{TGSI_OPCODE_BREAKC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5915	{TGSI_OPCODE_KIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5916	{TGSI_OPCODE_END,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5917	/* gap */
5918	{118,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5919	{TGSI_OPCODE_F2I,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2_trans},
5920	{TGSI_OPCODE_IDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5921	{TGSI_OPCODE_IMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5922	{TGSI_OPCODE_IMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5923	{TGSI_OPCODE_INEG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5924	{TGSI_OPCODE_ISGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5925	{TGSI_OPCODE_ISHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2_trans},
5926	{TGSI_OPCODE_ISLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5927	{TGSI_OPCODE_F2U,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2_trans},
5928	{TGSI_OPCODE_U2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5929	{TGSI_OPCODE_UADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5930	{TGSI_OPCODE_UDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5931	{TGSI_OPCODE_UMAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5932	{TGSI_OPCODE_UMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5933	{TGSI_OPCODE_UMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5934	{TGSI_OPCODE_UMOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5935	{TGSI_OPCODE_UMUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5936	{TGSI_OPCODE_USEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5937	{TGSI_OPCODE_USGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5938	{TGSI_OPCODE_USHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2_trans},
5939	{TGSI_OPCODE_USLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5940	{TGSI_OPCODE_USNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2_swap},
5941	{TGSI_OPCODE_SWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5942	{TGSI_OPCODE_CASE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5943	{TGSI_OPCODE_DEFAULT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5944	{TGSI_OPCODE_ENDSWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5945	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5946	{TGSI_OPCODE_SAMPLE_I,  0, 0, tgsi_unsupported},
5947	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
5948	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5949	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5950	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5951	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5952	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5953	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5954	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5955	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5956	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5957	{TGSI_OPCODE_UARL,      0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_r600_arl},
5958	{TGSI_OPCODE_UCMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ucmp},
5959	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5960	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5961	{TGSI_OPCODE_LOAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5962	{TGSI_OPCODE_STORE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5963	{TGSI_OPCODE_MFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5964	{TGSI_OPCODE_LFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5965	{TGSI_OPCODE_SFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5966	{TGSI_OPCODE_BARRIER,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5967	{TGSI_OPCODE_ATOMUADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5968	{TGSI_OPCODE_ATOMXCHG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5969	{TGSI_OPCODE_ATOMCAS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5970	{TGSI_OPCODE_ATOMAND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5971	{TGSI_OPCODE_ATOMOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5972	{TGSI_OPCODE_ATOMXOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5973	{TGSI_OPCODE_ATOMUMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5974	{TGSI_OPCODE_ATOMUMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5975	{TGSI_OPCODE_ATOMIMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5976	{TGSI_OPCODE_ATOMIMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5977	{TGSI_OPCODE_TEX2,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5978	{TGSI_OPCODE_TXB2,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5979	{TGSI_OPCODE_TXL2,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5980	{TGSI_OPCODE_LAST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5981};
5982
/*
 * TGSI instruction dispatch table carrying the "eg" prefix (presumably the
 * Evergreen variant -- TODO confirm at the site that selects the table).
 *
 * Every row supplies four initializers for struct
 * r600_shader_tgsi_instruction, in this order:
 *   1. the TGSI opcode (a bare number on the rows marked "gap"),
 *   2. a 0/1 flag -- 1 only on the MAD row, which is also the only row whose
 *      opcode constant is an OP3 one,
 *   3. an opcode constant (EG_V_SQ_ALU_*, EG_V_SQ_CF_*, SQ_TEX_INST_*,
 *      V_SQ_ALU_* or a literal 0),
 *   4. the function that handles the opcode (tgsi_unsupported for opcodes
 *      with no handler).
 * The struct's field names are declared outside this view.
 *
 * The numeric placeholders on the gap rows are ascending and the table ends
 * at TGSI_OPCODE_LAST, so the array is presumably indexed directly by TGSI
 * opcode value -- keep the row order in step with the opcode numbering
 * (verify against the lookup code before reordering or inserting rows).
 */
5983static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
5984	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5985	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5986	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5987	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, tgsi_trans_srcx_replicate},
5988	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, tgsi_rsq},
5989	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5990	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5991	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5992	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5993	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5994	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5995	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5996	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5997	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
	/* SLT is expressed as SETGT through the operand-swapping handler */
5998	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5999	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
	/* only row with the second initializer set to 1 (OP3 opcode constant) */
6000	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
	/* SUB shares the ADD opcode; presumably tgsi_op2 negates the second source -- confirm */
6001	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
6002	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
6003	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6004	/* gap */
6005	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6006	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6007	/* gap */
6008	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6009	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6010	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
6011	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6012	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
6013	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
6014	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
6015	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
6016	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
6017	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
6018	/* gap */
6019	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6020	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
6021	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6022	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
6023	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
6024	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
6025	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
6026	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
6027	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6028	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6029	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6030	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6031	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6032	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
6033	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6034	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
6035	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
6036	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
6037	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
6038	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6039	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
6040	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
6041	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
6042	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6043	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6044	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6045	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6046	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6047	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6048	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
6049	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6050	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6051	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6052	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
6053	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
6054	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
6055	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
6056	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6057	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6058	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
6059	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
6060	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
6061	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
6062	/* gap */
6063	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6064	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6065	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
6066	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
6067	/* gap */
6068	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6069	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6070	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6071	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6072	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
6073	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
6074	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
6075	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
6076	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
6077	/* gap */
6078	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6079	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
6080	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
6081	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
6082	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
6083	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6084	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
6085	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
6086	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
6087	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6088	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6089	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
6090	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6091	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
6092	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6093	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
6094	/* gap */
6095	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6096	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6097	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* NOTE(review): TGSI NOP is routed to tgsi_unsupported, not to a no-op handler -- confirm intended */
6098	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6099	/* gap */
6100	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6101	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6102	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6103	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6104	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6105	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6106	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6107	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6108	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
6109	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
6110	/* gap */
6111	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6112	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_f2i},
6113	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
6114	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
6115	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
6116	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
6117	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
6118	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
6119	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
6120	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_f2i},
6121	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
6122	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
6123	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
6124	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
6125	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
6126	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
6127	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
6128	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
6129	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
6130	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
6131	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
6132	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
6133	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
6134	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6135	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6136	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6137	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6138	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
6139	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
6140	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
6141	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
6142	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
6143	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
6144	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
6145	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
6146	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
6147	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
6148	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
6149	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
6150	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
	/*
	 * NOTE(review): the UCMP row and the LOAD..ATOMIMAX rows below use
	 * V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP without the EG_ prefix used by the
	 * rest of this table. All of them dispatch to tgsi_ucmp or
	 * tgsi_unsupported -- confirm the constant is ignored there or has the
	 * same value as the EG_ one.
	 */
6151	{TGSI_OPCODE_UCMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ucmp},
6152	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
6153	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
6154	{TGSI_OPCODE_LOAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6155	{TGSI_OPCODE_STORE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6156	{TGSI_OPCODE_MFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6157	{TGSI_OPCODE_LFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6158	{TGSI_OPCODE_SFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6159	{TGSI_OPCODE_BARRIER,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6160	{TGSI_OPCODE_ATOMUADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6161	{TGSI_OPCODE_ATOMXCHG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6162	{TGSI_OPCODE_ATOMCAS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6163	{TGSI_OPCODE_ATOMAND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6164	{TGSI_OPCODE_ATOMOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6165	{TGSI_OPCODE_ATOMXOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6166	{TGSI_OPCODE_ATOMUMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6167	{TGSI_OPCODE_ATOMUMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6168	{TGSI_OPCODE_ATOMIMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6169	{TGSI_OPCODE_ATOMIMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6170	{TGSI_OPCODE_TEX2,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
6171	{TGSI_OPCODE_TXB2,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
6172	{TGSI_OPCODE_TXL2,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
	/* terminating row: TGSI_OPCODE_LAST itself is not a supported instruction */
6173	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6174};
6175
/*
 * TGSI instruction dispatch table carrying the "cm" prefix (presumably the
 * Cayman variant, given the cayman_* handlers it references -- TODO confirm
 * at the site that selects the table).
 *
 * Every row supplies four initializers for struct
 * r600_shader_tgsi_instruction, in this order:
 *   1. the TGSI opcode (a bare number on the rows marked "gap"),
 *   2. a 0/1 flag -- 1 only on the MAD row, which is also the only row whose
 *      opcode constant is an OP3 one,
 *   3. an opcode constant (EG_V_SQ_ALU_*, EG_V_SQ_CF_*, SQ_TEX_INST_*,
 *      V_SQ_ALU_* or a literal 0),
 *   4. the function that handles the opcode (tgsi_unsupported for opcodes
 *      with no handler).
 * The struct's field names are declared outside this view.
 *
 * Rows that differ from eg_shader_tgsi_instruction include:
 *   - RCP, RSQ, EX2, LG2   -> cayman_emit_float_instr
 *   - POW                  -> cayman_pow
 *   - COS, SIN             -> cayman_trig
 *   - I2F, F2I, F2U, U2F   -> plain tgsi_op2
 *   - UMUL                 -> MULLO_INT via cayman_mul_int_instr
 *
 * The numeric placeholders on the gap rows are ascending and the table ends
 * at TGSI_OPCODE_LAST, so the array is presumably indexed directly by TGSI
 * opcode value -- keep the row order in step with the opcode numbering
 * (verify against the lookup code before reordering or inserting rows).
 */
6176static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
6177	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
6178	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
6179	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
6180	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, cayman_emit_float_instr},
6181	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, cayman_emit_float_instr},
6182	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
6183	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
6184	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
6185	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
6186	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
6187	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
6188	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
6189	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
6190	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
	/* SLT is expressed as SETGT through the operand-swapping handler */
6191	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
6192	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
	/* only row with the second initializer set to 1 (OP3 opcode constant) */
6193	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
	/* SUB shares the ADD opcode; presumably tgsi_op2 negates the second source -- confirm */
6194	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
6195	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
6196	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6197	/* gap */
6198	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6199	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6200	/* gap */
6201	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6202	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6203	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
6204	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6205	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
6206	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
6207	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, cayman_emit_float_instr},
6208	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, cayman_emit_float_instr},
6209	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, cayman_pow},
6210	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
6211	/* gap */
6212	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6213	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
6214	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6215	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
6216	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, cayman_trig},
6217	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
6218	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
6219	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
6220	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6221	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6222	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6223	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6224	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6225	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
6226	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6227	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
6228	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, cayman_trig},
6229	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
6230	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
6231	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6232	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
6233	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
6234	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
6235	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6236	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6237	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6238	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6239	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6240	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6241	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
6242	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6243	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6244	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6245	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
6246	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
6247	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
6248	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
6249	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6250	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6251	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
6252	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
6253	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
6254	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
6255	/* gap */
6256	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6257	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6258	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
6259	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
6260	/* gap */
6261	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6262	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6263	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6264	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6265	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
6266	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2},
6267	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
6268	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
6269	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
6270	/* gap */
6271	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6272	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
6273	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
6274	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
6275	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
6276	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6277	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
6278	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
6279	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
6280	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6281	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6282	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
6283	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6284	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
6285	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6286	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
6287	/* gap */
6288	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6289	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6290	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
	/* NOTE(review): TGSI NOP is routed to tgsi_unsupported, not to a no-op handler -- confirm intended */
6291	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6292	/* gap */
6293	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6294	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6295	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6296	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6297	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6298	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6299	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6300	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6301	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
6302	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
6303	/* gap */
6304	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6305	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2},
6306	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
6307	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
6308	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
6309	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
6310	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
6311	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
6312	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
6313	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
6314	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2},
6315	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
6316	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
6317	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
6318	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
6319	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
6320	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
	/*
	 * NOTE(review): unsigned multiply is mapped to MULLO_INT here, whereas
	 * the eg table uses MULLO_UINT -- presumably fine because only the low
	 * bits are kept, but confirm.
	 */
6321	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT, cayman_mul_int_instr},
6322	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
6323	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
6324	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
6325	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
6326	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
6327	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6328	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6329	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6330	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6331	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
6332	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
6333	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
6334	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
6335	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
6336	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
6337	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
6338	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
6339	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
6340	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
6341	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
6342	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
6343	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
	/*
	 * NOTE(review): the UCMP row and the LOAD..ATOMIMAX rows below use
	 * V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP without the EG_ prefix used by the
	 * rest of this table. All of them dispatch to tgsi_ucmp or
	 * tgsi_unsupported -- confirm the constant is ignored there or has the
	 * same value as the EG_ one.
	 */
6344	{TGSI_OPCODE_UCMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ucmp},
6345	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
6346	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
6347	{TGSI_OPCODE_LOAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6348	{TGSI_OPCODE_STORE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6349	{TGSI_OPCODE_MFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6350	{TGSI_OPCODE_LFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6351	{TGSI_OPCODE_SFENCE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6352	{TGSI_OPCODE_BARRIER,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6353	{TGSI_OPCODE_ATOMUADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6354	{TGSI_OPCODE_ATOMXCHG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6355	{TGSI_OPCODE_ATOMCAS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6356	{TGSI_OPCODE_ATOMAND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6357	{TGSI_OPCODE_ATOMOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6358	{TGSI_OPCODE_ATOMXOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6359	{TGSI_OPCODE_ATOMUMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6360	{TGSI_OPCODE_ATOMUMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6361	{TGSI_OPCODE_ATOMIMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6362	{TGSI_OPCODE_ATOMIMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6363	{TGSI_OPCODE_TEX2,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
6364	{TGSI_OPCODE_TXB2,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
6365	{TGSI_OPCODE_TXL2,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
	/* terminating row: TGSI_OPCODE_LAST itself is not a supported instruction */
6366	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
6367};
6368