r600_shader.c revision 6d3ad2dd2ba3ccdd211dbc618404519930631be2
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600d.h"
28
29#include "pipe/p_shader_tokens.h"
30#include "tgsi/tgsi_info.h"
31#include "tgsi/tgsi_parse.h"
32#include "tgsi/tgsi_scan.h"
33#include "tgsi/tgsi_dump.h"
34#include "util/u_memory.h"
35#include <stdio.h>
36#include <errno.h>
37#include <byteswap.h>
38
39/* CAYMAN notes
40Why CAYMAN got loops for lots of instructions is explained here.
41
42-These 8xx t-slot only ops are implemented in all vector slots.
43MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
44These 8xx t-slot only opcodes become vector ops, with all four
45slots expecting the arguments on sources a and b. Result is
46broadcast to all channels.
47MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
48These 8xx t-slot only opcodes become vector ops in the z, y, and
49x slots.
50EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
51RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
52SQRT_IEEE/_64
53SIN/COS
54The w slot may have an independent co-issued operation, or if the
55result is required to be in the w slot, the opcode above may be
56issued in the w slot as well.
57The compiler must issue the source argument to slots z, y, and x
58*/
59
60static int r600_pipe_shader(struct pipe_context *ctx, struct r600_pipe_shader *shader)
61{
62	struct r600_context *rctx = (struct r600_context *)ctx;
63	struct r600_shader *rshader = &shader->shader;
64	uint32_t *ptr;
65	int	i;
66
67	/* copy new shader */
68	if (shader->bo == NULL) {
69		shader->bo = (struct r600_resource*)
70			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, rshader->bc.ndw * 4);
71		if (shader->bo == NULL) {
72			return -ENOMEM;
73		}
74		ptr = (uint32_t*)rctx->ws->buffer_map(shader->bo->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
75		if (R600_BIG_ENDIAN) {
76			for (i = 0; i < rshader->bc.ndw; ++i) {
77				ptr[i] = bswap_32(rshader->bc.bytecode[i]);
78			}
79		} else {
80			memcpy(ptr, rshader->bc.bytecode, rshader->bc.ndw * sizeof(*ptr));
81		}
82		rctx->ws->buffer_unmap(shader->bo->cs_buf);
83	}
84	/* build state */
85	switch (rshader->processor_type) {
86	case TGSI_PROCESSOR_VERTEX:
87		if (rctx->chip_class >= EVERGREEN) {
88			evergreen_pipe_shader_vs(ctx, shader);
89		} else {
90			r600_pipe_shader_vs(ctx, shader);
91		}
92		break;
93	case TGSI_PROCESSOR_FRAGMENT:
94		if (rctx->chip_class >= EVERGREEN) {
95			evergreen_pipe_shader_ps(ctx, shader);
96		} else {
97			r600_pipe_shader_ps(ctx, shader);
98		}
99		break;
100	default:
101		return -EINVAL;
102	}
103	return 0;
104}
105
106static int r600_shader_from_tgsi(struct r600_context * rctx, struct r600_pipe_shader *pipeshader);
107
108int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *shader)
109{
110	static int dump_shaders = -1;
111	struct r600_context *rctx = (struct r600_context *)ctx;
112	struct r600_pipe_shader_selector *sel = shader->selector;
113	int r;
114
115	/* Would like some magic "get_bool_option_once" routine.
116	*/
117	if (dump_shaders == -1)
118		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
119
120	if (dump_shaders) {
121		fprintf(stderr, "--------------------------------------------------------------\n");
122		tgsi_dump(sel->tokens, 0);
123
124		if (sel->so.num_outputs) {
125			unsigned i;
126			fprintf(stderr, "STREAMOUT\n");
127			for (i = 0; i < sel->so.num_outputs; i++) {
128				unsigned mask = ((1 << sel->so.output[i].num_components) - 1) <<
129						sel->so.output[i].start_component;
130				fprintf(stderr, "  %i: MEM_STREAM0_BUF%i OUT[%i].%s%s%s%s\n", i,
131					sel->so.output[i].output_buffer, sel->so.output[i].register_index,
132				        mask & 1 ? "x" : "_",
133				        (mask >> 1) & 1 ? "y" : "_",
134				        (mask >> 2) & 1 ? "z" : "_",
135				        (mask >> 3) & 1 ? "w" : "_");
136			}
137		}
138	}
139	r = r600_shader_from_tgsi(rctx, shader);
140	if (r) {
141		R600_ERR("translation from TGSI failed !\n");
142		return r;
143	}
144	r = r600_bytecode_build(&shader->shader.bc);
145	if (r) {
146		R600_ERR("building bytecode failed !\n");
147		return r;
148	}
149	if (dump_shaders) {
150		r600_bytecode_dump(&shader->shader.bc);
151		fprintf(stderr, "______________________________________________________________\n");
152	}
153	return r600_pipe_shader(ctx, shader);
154}
155
156void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
157{
158	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
159	r600_bytecode_clear(&shader->shader.bc);
160}
161
162/*
163 * tgsi -> r600 shader
164 */
struct r600_shader_tgsi_instruction;

/* One TGSI source operand after translation by tgsi_src(). */
struct r600_shader_src {
	unsigned sel;		/* register/constant selector, or a V_SQ_ALU_SRC_* value */
	unsigned swizzle[4];	/* source channel for each of x, y, z, w */
	unsigned neg;		/* copied from the TGSI Negate flag */
	unsigned abs;		/* copied from the TGSI Absolute flag */
	unsigned rel;		/* V_SQ_REL_RELATIVE when the register is indirect */
	uint32_t value[4];	/* the four literal dwords when sel is V_SQ_ALU_SRC_LITERAL */
};
175
176struct r600_shader_ctx {
177	struct tgsi_shader_info			info;
178	struct tgsi_parse_context		parse;
179	const struct tgsi_token			*tokens;
180	unsigned				type;
181	unsigned				file_offset[TGSI_FILE_COUNT];
182	unsigned				temp_reg;
183	struct r600_shader_tgsi_instruction	*inst_info;
184	struct r600_bytecode			*bc;
185	struct r600_shader			*shader;
186	struct r600_shader_src			src[4];
187	uint32_t				*literals;
188	uint32_t				nliterals;
189	uint32_t				max_driver_temp_used;
190	/* needed for evergreen interpolation */
191	boolean                                 input_centroid;
192	boolean                                 input_linear;
193	boolean                                 input_perspective;
194	int					num_interp_gpr;
195	int					face_gpr;
196	int					colors_used;
197	boolean                 clip_vertex_write;
198	unsigned                cv_output;
199	int					fragcoord_input;
200	int					native_integers;
201};
202
/*
 * One entry of the per-chip opcode tables, which the code in this file
 * indexes by TGSI opcode.  The member order is kept unchanged.
 */
struct r600_shader_tgsi_instruction {
	unsigned tgsi_opcode;
	unsigned is_op3;
	unsigned r600_opcode;
	int (*process)(struct r600_shader_ctx *ctx);	/* handler invoked with the translation context */
};
209
210static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
211static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
212static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
213static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
214static int tgsi_else(struct r600_shader_ctx *ctx);
215static int tgsi_endif(struct r600_shader_ctx *ctx);
216static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
217static int tgsi_endloop(struct r600_shader_ctx *ctx);
218static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
219
220/*
221 * bytestream -> r600 shader
222 *
223 * These functions are used to transform the output of the LLVM backend into
224 * struct r600_bytecode.
225 */
226
/* Parses an LLVM-backend byte stream into ctx->bc; defined further below. */
static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
				unsigned char * bytes,	unsigned num_bytes);

#ifdef HAVE_OPENCL
/**
 * Compile an LLVM module into r600 bytecode for a compute shader.
 *
 * @param ctx       pipe context (cast to struct r600_context).
 * @param mod       LLVM module handed to r600_llvm_compile().
 * @param bytecode  bytecode object that is initialised and filled in.
 * @return always 1.
 */
int r600_compute_shader_create(struct pipe_context * ctx,
	LLVMModuleRef mod,  struct r600_bytecode * bytecode)
{
	struct r600_context *r600_ctx = (struct r600_context *)ctx;
	unsigned char * bytes;
	unsigned byte_count;
	struct r600_shader_ctx shader_ctx;
	unsigned dump = 0;

	/* Only .bc is assigned below, yet the byte-stream parsers read other
	 * members as well (e.g. ->shader when a KILL is seen).  Start from an
	 * all-zero context so they never observe indeterminate values. */
	memset(&shader_ctx, 0, sizeof(shader_ctx));

	if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
		dump = 1;
	}

	r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
	shader_ctx.bc = bytecode;
	r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family);
	shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
	r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
	if (shader_ctx.bc->chip_class == CAYMAN) {
		cm_bytecode_add_cf_end(shader_ctx.bc);
	}
	r600_bytecode_build(shader_ctx.bc);
	if (dump) {
		r600_bytecode_dump(shader_ctx.bc);
	}
	/* NOTE(review): `bytes` is never released here; whether the caller of
	 * r600_llvm_compile() owns that buffer is not visible in this file --
	 * confirm and free it if so. */
	return 1;
}

#endif /* HAVE_OPENCL */
260
/**
 * Read a 32-bit little-endian value from a byte stream.
 *
 * @param bytes       stream to read from.
 * @param bytes_read  in/out cursor into bytes; advanced by four.
 * @return the assembled value (bytes[cursor] is the least significant byte).
 */
static uint32_t i32_from_byte_stream(unsigned char * bytes,
		unsigned * bytes_read)
{
	unsigned i;
	uint32_t out = 0;
	for (i = 0; i < 4; i++) {
		/* Widen before shifting: the byte would otherwise be promoted to
		 * int, and shifting a value >= 0x80 left by 24 overflows it. */
		out |= (uint32_t)bytes[(*bytes_read)++] << (8 * i);
	}
	return out;
}
271
272static unsigned r600_src_from_byte_stream(unsigned char * bytes,
273		unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
274{
275	unsigned i;
276	unsigned sel0, sel1;
277	sel0 = bytes[bytes_read++];
278	sel1 = bytes[bytes_read++];
279	alu->src[src_idx].sel = sel0 | (sel1 << 8);
280	alu->src[src_idx].chan = bytes[bytes_read++];
281	alu->src[src_idx].neg = bytes[bytes_read++];
282	alu->src[src_idx].abs = bytes[bytes_read++];
283	alu->src[src_idx].rel = bytes[bytes_read++];
284	alu->src[src_idx].kc_bank = bytes[bytes_read++];
285	for (i = 0; i < 4; i++) {
286		alu->src[src_idx].value |= bytes[bytes_read++] << (i * 8);
287	}
288	return bytes_read;
289}
290
291static unsigned r600_alu_from_byte_stream(struct r600_shader_ctx *ctx,
292				unsigned char * bytes, unsigned bytes_read)
293{
294	unsigned src_idx;
295	unsigned inst0, inst1;
296	struct r600_bytecode_alu alu;
297	memset(&alu, 0, sizeof(alu));
298	for(src_idx = 0; src_idx < 3; src_idx++) {
299		bytes_read = r600_src_from_byte_stream(bytes, bytes_read,
300								&alu, src_idx);
301	}
302
303	alu.dst.sel = bytes[bytes_read++];
304	alu.dst.chan = bytes[bytes_read++];
305	alu.dst.clamp = bytes[bytes_read++];
306	alu.dst.write = bytes[bytes_read++];
307	alu.dst.rel = bytes[bytes_read++];
308	inst0 = bytes[bytes_read++];
309	inst1 = bytes[bytes_read++];
310	alu.inst = inst0 | (inst1 << 8);
311	alu.last = bytes[bytes_read++];
312	alu.is_op3 = bytes[bytes_read++];
313	alu.predicate = bytes[bytes_read++];
314	alu.bank_swizzle = bytes[bytes_read++];
315	alu.bank_swizzle_force = bytes[bytes_read++];
316	alu.omod = bytes[bytes_read++];
317	alu.index_mode = bytes[bytes_read++];
318	r600_bytecode_add_alu(ctx->bc, &alu);
319
320	/* XXX: Handle other KILL instructions */
321	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT)) {
322		ctx->shader->uses_kill = 1;
323		/* XXX: This should be enforced in the LLVM backend. */
324		ctx->bc->force_add_cf = 1;
325	}
326	return bytes_read;
327}
328
329static void llvm_if(struct r600_shader_ctx *ctx, struct r600_bytecode_alu * alu,
330	unsigned pred_inst)
331{
332	alu->inst = pred_inst;
333	alu->predicate = 1;
334	alu->dst.write = 0;
335	alu->src[1].sel = V_SQ_ALU_SRC_0;
336	alu->src[1].chan = 0;
337	alu->last = 1;
338	r600_bytecode_add_alu_type(ctx->bc, alu,
339		CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
340
341	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
342	fc_pushlevel(ctx, FC_IF);
343	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
344}
345
346static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx,
347			struct r600_bytecode_alu *alu, unsigned compare_opcode)
348{
349	unsigned opcode = TGSI_OPCODE_BRK;
350	if (ctx->bc->chip_class == CAYMAN)
351		ctx->inst_info = &cm_shader_tgsi_instruction[opcode];
352	else if (ctx->bc->chip_class >= EVERGREEN)
353		ctx->inst_info = &eg_shader_tgsi_instruction[opcode];
354	else
355		ctx->inst_info = &r600_shader_tgsi_instruction[opcode];
356	llvm_if(ctx, alu, compare_opcode);
357	tgsi_loop_brk_cont(ctx);
358	tgsi_endif(ctx);
359}
360
361static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
362				unsigned char * bytes, unsigned bytes_read)
363{
364	struct r600_bytecode_alu alu;
365	unsigned inst;
366	memset(&alu, 0, sizeof(alu));
367	bytes_read = r600_src_from_byte_stream(bytes, bytes_read, &alu, 0);
368	inst = bytes[bytes_read++];
369	switch (inst) {
370	case 0: /* FC_IF */
371		llvm_if(ctx, &alu,
372			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
373		break;
374	case 1: /* FC_IF_INT */
375		llvm_if(ctx, &alu,
376			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
377		break;
378	case 2: /* FC_ELSE */
379		tgsi_else(ctx);
380		break;
381	case 3: /* FC_ENDIF */
382		tgsi_endif(ctx);
383		break;
384	case 4: /* FC_BGNLOOP */
385		tgsi_bgnloop(ctx);
386		break;
387	case 5: /* FC_ENDLOOP */
388		tgsi_endloop(ctx);
389		break;
390	case 6: /* FC_BREAK */
391		r600_break_from_byte_stream(ctx, &alu,
392			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
393		break;
394	case 7: /* FC_BREAK_NZ_INT */
395		r600_break_from_byte_stream(ctx, &alu,
396			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
397		break;
398	case 8: /* FC_CONTINUE */
399		{
400			unsigned opcode = TGSI_OPCODE_CONT;
401			if (ctx->bc->chip_class == CAYMAN) {
402				ctx->inst_info =
403					&cm_shader_tgsi_instruction[opcode];
404			} else if (ctx->bc->chip_class >= EVERGREEN) {
405				ctx->inst_info =
406					&eg_shader_tgsi_instruction[opcode];
407			} else {
408				ctx->inst_info =
409					&r600_shader_tgsi_instruction[opcode];
410			}
411			tgsi_loop_brk_cont(ctx);
412		}
413		break;
414	case 9: /* FC_BREAK_Z_INT */
415		r600_break_from_byte_stream(ctx, &alu,
416			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
417		break;
418	case 10: /* FC_BREAK_NZ */
419		r600_break_from_byte_stream(ctx, &alu,
420			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
421		break;
422	}
423
424	return bytes_read;
425}
426
427static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
428				unsigned char * bytes, unsigned bytes_read)
429{
430	struct r600_bytecode_tex tex;
431
432	tex.inst = bytes[bytes_read++];
433	tex.resource_id = bytes[bytes_read++];
434	tex.src_gpr = bytes[bytes_read++];
435	tex.src_rel = bytes[bytes_read++];
436	tex.dst_gpr = bytes[bytes_read++];
437	tex.dst_rel = bytes[bytes_read++];
438	tex.dst_sel_x = bytes[bytes_read++];
439	tex.dst_sel_y = bytes[bytes_read++];
440	tex.dst_sel_z = bytes[bytes_read++];
441	tex.dst_sel_w = bytes[bytes_read++];
442	tex.lod_bias = bytes[bytes_read++];
443	tex.coord_type_x = bytes[bytes_read++];
444	tex.coord_type_y = bytes[bytes_read++];
445	tex.coord_type_z = bytes[bytes_read++];
446	tex.coord_type_w = bytes[bytes_read++];
447	tex.offset_x = bytes[bytes_read++];
448	tex.offset_y = bytes[bytes_read++];
449	tex.offset_z = bytes[bytes_read++];
450	tex.sampler_id = bytes[bytes_read++];
451	tex.src_sel_x = bytes[bytes_read++];
452	tex.src_sel_y = bytes[bytes_read++];
453	tex.src_sel_z = bytes[bytes_read++];
454	tex.src_sel_w = bytes[bytes_read++];
455
456	r600_bytecode_add_tex(ctx->bc, &tex);
457
458	return bytes_read;
459}
460
461static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
462	unsigned char * bytes, unsigned bytes_read)
463{
464	struct r600_bytecode_vtx vtx;
465
466	uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
467        uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
468	uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);
469
470	memset(&vtx, 0, sizeof(vtx));
471
472	/* WORD0 */
473	vtx.inst = G_SQ_VTX_WORD0_VTX_INST(word0);
474	vtx.fetch_type = G_SQ_VTX_WORD0_FETCH_TYPE(word0);
475	vtx.buffer_id = G_SQ_VTX_WORD0_BUFFER_ID(word0);
476	vtx.src_gpr = G_SQ_VTX_WORD0_SRC_GPR(word0);
477	vtx.src_sel_x = G_SQ_VTX_WORD0_SRC_SEL_X(word0);
478	vtx.mega_fetch_count = G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(word0);
479
480	/* WORD1 */
481	vtx.dst_gpr = G_SQ_VTX_WORD1_GPR_DST_GPR(word1);
482	vtx.dst_sel_x = G_SQ_VTX_WORD1_DST_SEL_X(word1);
483	vtx.dst_sel_y = G_SQ_VTX_WORD1_DST_SEL_Y(word1);
484	vtx.dst_sel_z = G_SQ_VTX_WORD1_DST_SEL_Z(word1);
485	vtx.dst_sel_w = G_SQ_VTX_WORD1_DST_SEL_W(word1);
486	vtx.use_const_fields = G_SQ_VTX_WORD1_USE_CONST_FIELDS(word1);
487	vtx.data_format = G_SQ_VTX_WORD1_DATA_FORMAT(word1);
488	vtx.num_format_all = G_SQ_VTX_WORD1_NUM_FORMAT_ALL(word1);
489	vtx.format_comp_all = G_SQ_VTX_WORD1_FORMAT_COMP_ALL(word1);
490	vtx.srf_mode_all = G_SQ_VTX_WORD1_SRF_MODE_ALL(word1);
491
492	/* WORD 2*/
493	vtx.offset = G_SQ_VTX_WORD2_OFFSET(word2);
494	vtx.endian = G_SQ_VTX_WORD2_ENDIAN_SWAP(word2);
495
496	if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
497		fprintf(stderr, "Error adding vtx\n");
498	}
499	/* Use the Texture Cache */
500	ctx->bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX;
501	return bytes_read;
502}
503
504static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
505				unsigned char * bytes,	unsigned num_bytes)
506{
507	unsigned bytes_read = 0;
508	unsigned i, byte;
509	while (bytes_read < num_bytes) {
510		char inst_type = bytes[bytes_read++];
511		switch (inst_type) {
512		case 0:
513			bytes_read = r600_alu_from_byte_stream(ctx, bytes,
514								bytes_read);
515			break;
516		case 1:
517			bytes_read = r600_tex_from_byte_stream(ctx, bytes,
518								bytes_read);
519			break;
520		case 2:
521			bytes_read = r600_fc_from_byte_stream(ctx, bytes,
522								bytes_read);
523			break;
524		case 3:
525			r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
526			for (i = 0; i < 2; i++) {
527				for (byte = 0 ; byte < 4; byte++) {
528					ctx->bc->cf_last->isa[i] |=
529					(bytes[bytes_read++] << (byte * 8));
530				}
531			}
532			break;
533
534		case 4:
535			bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
536								bytes_read);
537			break;
538		default:
539			/* XXX: Error here */
540			break;
541		}
542	}
543}
544
545/* End bytestream -> r600 shader functions*/
546
547static int tgsi_is_supported(struct r600_shader_ctx *ctx)
548{
549	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
550	int j;
551
552	if (i->Instruction.NumDstRegs > 1) {
553		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
554		return -EINVAL;
555	}
556	if (i->Instruction.Predicate) {
557		R600_ERR("predicate unsupported\n");
558		return -EINVAL;
559	}
560#if 0
561	if (i->Instruction.Label) {
562		R600_ERR("label unsupported\n");
563		return -EINVAL;
564	}
565#endif
566	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
567		if (i->Src[j].Register.Dimension) {
568			R600_ERR("unsupported src %d (dimension %d)\n", j,
569				 i->Src[j].Register.Dimension);
570			return -EINVAL;
571		}
572	}
573	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
574		if (i->Dst[j].Register.Dimension) {
575			R600_ERR("unsupported dst (dimension)\n");
576			return -EINVAL;
577		}
578	}
579	return 0;
580}
581
582static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
583{
584	int i, r;
585	struct r600_bytecode_alu alu;
586	int gpr = 0, base_chan = 0;
587	int ij_index = 0;
588
589	if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
590		ij_index = 0;
591		if (ctx->shader->input[input].centroid)
592			ij_index++;
593	} else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
594		ij_index = 0;
595		/* if we have perspective add one */
596		if (ctx->input_perspective)  {
597			ij_index++;
598			/* if we have perspective centroid */
599			if (ctx->input_centroid)
600				ij_index++;
601		}
602		if (ctx->shader->input[input].centroid)
603			ij_index++;
604	}
605
606	/* work out gpr and base_chan from index */
607	gpr = ij_index / 2;
608	base_chan = (2 * (ij_index % 2)) + 1;
609
610	for (i = 0; i < 8; i++) {
611		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
612
613		if (i < 4)
614			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_ZW;
615		else
616			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_XY;
617
618		if ((i > 1) && (i < 6)) {
619			alu.dst.sel = ctx->shader->input[input].gpr;
620			alu.dst.write = 1;
621		}
622
623		alu.dst.chan = i % 4;
624
625		alu.src[0].sel = gpr;
626		alu.src[0].chan = (base_chan - (i % 2));
627
628		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
629
630		alu.bank_swizzle_force = SQ_ALU_VEC_210;
631		if ((i % 4) == 3)
632			alu.last = 1;
633		r = r600_bytecode_add_alu(ctx->bc, &alu);
634		if (r)
635			return r;
636	}
637	return 0;
638}
639
640static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
641{
642	int i, r;
643	struct r600_bytecode_alu alu;
644
645	for (i = 0; i < 4; i++) {
646		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
647
648		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P0;
649
650		alu.dst.sel = ctx->shader->input[input].gpr;
651		alu.dst.write = 1;
652
653		alu.dst.chan = i;
654
655		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
656		alu.src[0].chan = i;
657
658		if (i == 3)
659			alu.last = 1;
660		r = r600_bytecode_add_alu(ctx->bc, &alu);
661		if (r)
662			return r;
663	}
664	return 0;
665}
666
667/*
668 * Special export handling in shaders
669 *
670 * shader export ARRAY_BASE for EXPORT_POS:
671 * 60 is position
672 * 61 is misc vector
673 * 62, 63 are clip distance vectors
674 *
675 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
676 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
677 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
678 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
679 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
680 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
681 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
682 * exclusive from render target index)
683 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
684 *
685 *
686 * shader export ARRAY_BASE for EXPORT_PIXEL:
687 * 0-7 CB targets
688 * 61 computed Z vector
689 *
690 * The use of the values exported in the computed Z vector are controlled
691 * by DB_SHADER_CONTROL:
692 * Z_EXPORT_ENABLE - Z as a float in RED
693 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
694 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
695 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
696 * DB_SOURCE_FORMAT - export control restrictions
697 *
698 */
699
700
701/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
702static int r600_spi_sid(struct r600_shader_io * io)
703{
704	int index, name = io->name;
705
706	/* These params are handled differently, they don't need
707	 * semantic indices, so we'll use 0 for them.
708	 */
709	if (name == TGSI_SEMANTIC_POSITION ||
710		name == TGSI_SEMANTIC_PSIZE ||
711		name == TGSI_SEMANTIC_FACE)
712		index = 0;
713	else {
714		if (name == TGSI_SEMANTIC_GENERIC) {
715			/* For generic params simply use sid from tgsi */
716			index = io->sid;
717		} else {
718			/* For non-generic params - pack name and sid into 8 bits */
719			index = 0x80 | (name<<3) | (io->sid);
720		}
721
722		/* Make sure that all really used indices have nonzero value, so
723		 * we can just compare it to 0 later instead of comparing the name
724		 * with different values to detect special cases. */
725		index++;
726	}
727
728	return index;
729};
730
731/* turn input into interpolate on EG */
732static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
733{
734	int r = 0;
735
736	if (ctx->shader->input[index].spi_sid) {
737		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
738		if (ctx->shader->input[index].interpolate > 0) {
739			r = evergreen_interp_alu(ctx, index);
740		} else {
741			r = evergreen_interp_flat(ctx, index);
742		}
743	}
744	return r;
745}
746
747static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
748{
749	struct r600_bytecode_alu alu;
750	int i, r;
751	int gpr_front = ctx->shader->input[front].gpr;
752	int gpr_back = ctx->shader->input[back].gpr;
753
754	for (i = 0; i < 4; i++) {
755		memset(&alu, 0, sizeof(alu));
756		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
757		alu.is_op3 = 1;
758		alu.dst.write = 1;
759		alu.dst.sel = gpr_front;
760		alu.src[0].sel = ctx->face_gpr;
761		alu.src[1].sel = gpr_front;
762		alu.src[2].sel = gpr_back;
763
764		alu.dst.chan = i;
765		alu.src[1].chan = i;
766		alu.src[2].chan = i;
767		alu.last = (i==3);
768
769		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
770			return r;
771	}
772
773	return 0;
774}
775
776static int tgsi_declaration(struct r600_shader_ctx *ctx)
777{
778	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
779	unsigned i;
780	int r;
781
782	switch (d->Declaration.File) {
783	case TGSI_FILE_INPUT:
784		i = ctx->shader->ninput++;
785		ctx->shader->input[i].name = d->Semantic.Name;
786		ctx->shader->input[i].sid = d->Semantic.Index;
787		ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
788		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
789		ctx->shader->input[i].centroid = d->Interp.Centroid;
790		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
791		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
792			switch (ctx->shader->input[i].name) {
793			case TGSI_SEMANTIC_FACE:
794				ctx->face_gpr = ctx->shader->input[i].gpr;
795				break;
796			case TGSI_SEMANTIC_COLOR:
797				ctx->colors_used++;
798				break;
799			case TGSI_SEMANTIC_POSITION:
800				ctx->fragcoord_input = i;
801				break;
802			}
803			if (ctx->bc->chip_class >= EVERGREEN) {
804				if ((r = evergreen_interp_input(ctx, i)))
805					return r;
806			}
807		}
808		break;
809	case TGSI_FILE_OUTPUT:
810		i = ctx->shader->noutput++;
811		ctx->shader->output[i].name = d->Semantic.Name;
812		ctx->shader->output[i].sid = d->Semantic.Index;
813		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
814		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
815		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
816		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
817		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
818			switch (d->Semantic.Name) {
819			case TGSI_SEMANTIC_CLIPDIST:
820				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
821				break;
822			case TGSI_SEMANTIC_PSIZE:
823				ctx->shader->vs_out_misc_write = 1;
824				ctx->shader->vs_out_point_size = 1;
825				break;
826			case TGSI_SEMANTIC_CLIPVERTEX:
827				ctx->clip_vertex_write = TRUE;
828				ctx->cv_output = i;
829				break;
830			}
831		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
832			switch (d->Semantic.Name) {
833			case TGSI_SEMANTIC_COLOR:
834				ctx->shader->nr_ps_max_color_exports++;
835				break;
836			}
837		}
838		break;
839	case TGSI_FILE_CONSTANT:
840	case TGSI_FILE_TEMPORARY:
841	case TGSI_FILE_SAMPLER:
842	case TGSI_FILE_ADDRESS:
843		break;
844
845	case TGSI_FILE_SYSTEM_VALUE:
846		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
847			if (!ctx->native_integers) {
848				struct r600_bytecode_alu alu;
849				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
850
851				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
852				alu.src[0].sel = 0;
853				alu.src[0].chan = 3;
854
855				alu.dst.sel = 0;
856				alu.dst.chan = 3;
857				alu.dst.write = 1;
858				alu.last = 1;
859
860				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
861					return r;
862			}
863			break;
864		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
865			break;
866	default:
867		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
868		return -EINVAL;
869	}
870	return 0;
871}
872
873static int r600_get_temp(struct r600_shader_ctx *ctx)
874{
875	return ctx->temp_reg + ctx->max_driver_temp_used++;
876}
877
878/*
879 * for evergreen we need to scan the shader to find the number of GPRs we need to
880 * reserve for interpolation.
881 *
882 * we need to know if we are going to emit
883 * any centroid inputs
884 * if perspective and linear are required
885*/
886static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
887{
888	int i;
889	int num_baryc;
890
891	ctx->input_linear = FALSE;
892	ctx->input_perspective = FALSE;
893	ctx->input_centroid = FALSE;
894	ctx->num_interp_gpr = 1;
895
896	/* any centroid inputs */
897	for (i = 0; i < ctx->info.num_inputs; i++) {
898		/* skip position/face */
899		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
900		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
901			continue;
902		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
903			ctx->input_linear = TRUE;
904		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
905			ctx->input_perspective = TRUE;
906		if (ctx->info.input_centroid[i])
907			ctx->input_centroid = TRUE;
908	}
909
910	num_baryc = 0;
911	/* ignoring sample for now */
912	if (ctx->input_perspective)
913		num_baryc++;
914	if (ctx->input_linear)
915		num_baryc++;
916	if (ctx->input_centroid)
917		num_baryc *= 2;
918
919	ctx->num_interp_gpr += (num_baryc + 1) >> 1;
920
921	/* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */
922	return ctx->num_interp_gpr;
923}
924
925static void tgsi_src(struct r600_shader_ctx *ctx,
926		     const struct tgsi_full_src_register *tgsi_src,
927		     struct r600_shader_src *r600_src)
928{
929	memset(r600_src, 0, sizeof(*r600_src));
930	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
931	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
932	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
933	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
934	r600_src->neg = tgsi_src->Register.Negate;
935	r600_src->abs = tgsi_src->Register.Absolute;
936
937	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
938		int index;
939		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
940			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
941			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
942
943			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
944			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
945			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
946				return;
947		}
948		index = tgsi_src->Register.Index;
949		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
950		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
951	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
952		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
953			r600_src->swizzle[0] = 3;
954			r600_src->swizzle[1] = 3;
955			r600_src->swizzle[2] = 3;
956			r600_src->swizzle[3] = 3;
957			r600_src->sel = 0;
958		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
959			r600_src->swizzle[0] = 0;
960			r600_src->swizzle[1] = 0;
961			r600_src->swizzle[2] = 0;
962			r600_src->swizzle[3] = 0;
963			r600_src->sel = 0;
964		}
965	} else {
966		if (tgsi_src->Register.Indirect)
967			r600_src->rel = V_SQ_REL_RELATIVE;
968		r600_src->sel = tgsi_src->Register.Index;
969		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
970	}
971}
972
973static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int offset, unsigned int dst_reg)
974{
975	struct r600_bytecode_vtx vtx;
976	unsigned int ar_reg;
977	int r;
978
979	if (offset) {
980		struct r600_bytecode_alu alu;
981
982		memset(&alu, 0, sizeof(alu));
983
984		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
985		alu.src[0].sel = ctx->bc->ar_reg;
986
987		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
988		alu.src[1].value = offset;
989
990		alu.dst.sel = dst_reg;
991		alu.dst.write = 1;
992		alu.last = 1;
993
994		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
995			return r;
996
997		ar_reg = dst_reg;
998	} else {
999		ar_reg = ctx->bc->ar_reg;
1000	}
1001
1002	memset(&vtx, 0, sizeof(vtx));
1003	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
1004	vtx.src_gpr = ar_reg;
1005	vtx.mega_fetch_count = 16;
1006	vtx.dst_gpr = dst_reg;
1007	vtx.dst_sel_x = 0;		/* SEL_X */
1008	vtx.dst_sel_y = 1;		/* SEL_Y */
1009	vtx.dst_sel_z = 2;		/* SEL_Z */
1010	vtx.dst_sel_w = 3;		/* SEL_W */
1011	vtx.data_format = FMT_32_32_32_32_FLOAT;
1012	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1013	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1014	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
1015	vtx.endian = r600_endian_swap(32);
1016
1017	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1018		return r;
1019
1020	return 0;
1021}
1022
1023static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1024{
1025	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1026	struct r600_bytecode_alu alu;
1027	int i, j, k, nconst, r;
1028
1029	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1030		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1031			nconst++;
1032		}
1033		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1034	}
1035	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1036		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1037			continue;
1038		}
1039
1040		if (ctx->src[i].rel) {
1041			int treg = r600_get_temp(ctx);
1042			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].sel - 512, treg)))
1043				return r;
1044
1045			ctx->src[i].sel = treg;
1046			ctx->src[i].rel = 0;
1047			j--;
1048		} else if (j > 0) {
1049			int treg = r600_get_temp(ctx);
1050			for (k = 0; k < 4; k++) {
1051				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1052				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1053				alu.src[0].sel = ctx->src[i].sel;
1054				alu.src[0].chan = k;
1055				alu.src[0].rel = ctx->src[i].rel;
1056				alu.dst.sel = treg;
1057				alu.dst.chan = k;
1058				alu.dst.write = 1;
1059				if (k == 3)
1060					alu.last = 1;
1061				r = r600_bytecode_add_alu(ctx->bc, &alu);
1062				if (r)
1063					return r;
1064			}
1065			ctx->src[i].sel = treg;
1066			ctx->src[i].rel =0;
1067			j--;
1068		}
1069	}
1070	return 0;
1071}
1072
1073/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1074static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1075{
1076	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1077	struct r600_bytecode_alu alu;
1078	int i, j, k, nliteral, r;
1079
1080	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1081		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1082			nliteral++;
1083		}
1084	}
1085	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1086		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1087			int treg = r600_get_temp(ctx);
1088			for (k = 0; k < 4; k++) {
1089				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1090				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1091				alu.src[0].sel = ctx->src[i].sel;
1092				alu.src[0].chan = k;
1093				alu.src[0].value = ctx->src[i].value[k];
1094				alu.dst.sel = treg;
1095				alu.dst.chan = k;
1096				alu.dst.write = 1;
1097				if (k == 3)
1098					alu.last = 1;
1099				r = r600_bytecode_add_alu(ctx->bc, &alu);
1100				if (r)
1101					return r;
1102			}
1103			ctx->src[i].sel = treg;
1104			j--;
1105		}
1106	}
1107	return 0;
1108}
1109
1110static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1111{
1112	int i, r, count = ctx->shader->ninput;
1113
1114	/* additional inputs will be allocated right after the existing inputs,
1115	 * we won't need them after the color selection, so we don't need to
1116	 * reserve these gprs for the rest of the shader code and to adjust
1117	 * output offsets etc. */
1118	int gpr = ctx->file_offset[TGSI_FILE_INPUT] +
1119			ctx->info.file_max[TGSI_FILE_INPUT] + 1;
1120
1121	if (ctx->face_gpr == -1) {
1122		i = ctx->shader->ninput++;
1123		ctx->shader->input[i].name = TGSI_SEMANTIC_FACE;
1124		ctx->shader->input[i].spi_sid = 0;
1125		ctx->shader->input[i].gpr = gpr++;
1126		ctx->face_gpr = ctx->shader->input[i].gpr;
1127	}
1128
1129	for (i = 0; i < count; i++) {
1130		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1131			int ni = ctx->shader->ninput++;
1132			memcpy(&ctx->shader->input[ni],&ctx->shader->input[i], sizeof(struct r600_shader_io));
1133			ctx->shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
1134			ctx->shader->input[ni].spi_sid = r600_spi_sid(&ctx->shader->input[ni]);
1135			ctx->shader->input[ni].gpr = gpr++;
1136
1137			if (ctx->bc->chip_class >= EVERGREEN) {
1138				r = evergreen_interp_input(ctx, ni);
1139				if (r)
1140					return r;
1141			}
1142
1143			r = select_twoside_color(ctx, i, ni);
1144			if (r)
1145				return r;
1146		}
1147	}
1148	return 0;
1149}
1150
1151static int r600_shader_from_tgsi(struct r600_context * rctx, struct r600_pipe_shader *pipeshader)
1152{
1153	struct r600_shader *shader = &pipeshader->shader;
1154	struct tgsi_token *tokens = pipeshader->selector->tokens;
1155	struct pipe_stream_output_info so = pipeshader->selector->so;
1156	struct tgsi_full_immediate *immediate;
1157	struct tgsi_full_property *property;
1158	struct r600_shader_ctx ctx;
1159	struct r600_bytecode_output output[32];
1160	unsigned output_done, noutput;
1161	unsigned opcode;
1162	int i, j, k, r = 0;
1163	int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0;
1164	/* Declarations used by llvm code */
1165	bool use_llvm = false;
1166	unsigned char * inst_bytes = NULL;
1167	unsigned inst_byte_count = 0;
1168
1169#ifdef R600_USE_LLVM
1170	use_llvm = debug_get_bool_option("R600_LLVM", TRUE);
1171#endif
1172	ctx.bc = &shader->bc;
1173	ctx.shader = shader;
1174	ctx.native_integers = true;
1175
1176	r600_bytecode_init(ctx.bc, rctx->chip_class, rctx->family);
1177	ctx.tokens = tokens;
1178	tgsi_scan_shader(tokens, &ctx.info);
1179	tgsi_parse_init(&ctx.parse, tokens);
1180	ctx.type = ctx.parse.FullHeader.Processor.Processor;
1181	shader->processor_type = ctx.type;
1182	ctx.bc->type = shader->processor_type;
1183
1184	ctx.face_gpr = -1;
1185	ctx.fragcoord_input = -1;
1186	ctx.colors_used = 0;
1187	ctx.clip_vertex_write = 0;
1188
1189	shader->nr_ps_color_exports = 0;
1190	shader->nr_ps_max_color_exports = 0;
1191
1192	shader->two_side = (ctx.type == TGSI_PROCESSOR_FRAGMENT) && rctx->two_side;
1193
1194	/* register allocations */
1195	/* Values [0,127] correspond to GPR[0..127].
1196	 * Values [128,159] correspond to constant buffer bank 0
1197	 * Values [160,191] correspond to constant buffer bank 1
1198	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1199	 * Values [256,287] correspond to constant buffer bank 2 (EG)
1200	 * Values [288,319] correspond to constant buffer bank 3 (EG)
1201	 * Other special values are shown in the list below.
1202	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1203	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1204	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1205	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1206	 * 248	SQ_ALU_SRC_0: special constant 0.0.
1207	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
1208	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
1209	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1210	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
1211	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
1212	 * 254	SQ_ALU_SRC_PV: previous vector result.
1213	 * 255	SQ_ALU_SRC_PS: previous scalar result.
1214	 */
1215	for (i = 0; i < TGSI_FILE_COUNT; i++) {
1216		ctx.file_offset[i] = 0;
1217	}
1218	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1219		ctx.file_offset[TGSI_FILE_INPUT] = 1;
1220		if (ctx.bc->chip_class >= EVERGREEN) {
1221			r600_bytecode_add_cfinst(ctx.bc, EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1222		} else {
1223			r600_bytecode_add_cfinst(ctx.bc, V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1224		}
1225	}
1226	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
1227		ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1228	}
1229
1230	/* LLVM backend setup */
1231#ifdef R600_USE_LLVM
1232	if (use_llvm && ctx.info.indirect_files) {
1233		fprintf(stderr, "Warning: R600 LLVM backend does not support "
1234				"indirect adressing.  Falling back to TGSI "
1235				"backend.\n");
1236		use_llvm = 0;
1237	}
1238	if (use_llvm) {
1239		struct radeon_llvm_context radeon_llvm_ctx;
1240		LLVMModuleRef mod;
1241		unsigned dump = 0;
1242		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
1243		radeon_llvm_ctx.reserved_reg_count = ctx.file_offset[TGSI_FILE_INPUT];
1244		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
1245		if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
1246			dump = 1;
1247		}
1248		if (r600_llvm_compile(mod, &inst_bytes, &inst_byte_count,
1249							rctx->family, dump)) {
1250			FREE(inst_bytes);
1251			radeon_llvm_dispose(&radeon_llvm_ctx);
1252			use_llvm = 0;
1253			fprintf(stderr, "R600 LLVM backend failed to compile "
1254				"shader.  Falling back to TGSI\n");
1255		} else {
1256			ctx.file_offset[TGSI_FILE_OUTPUT] =
1257					ctx.file_offset[TGSI_FILE_INPUT];
1258		}
1259		radeon_llvm_dispose(&radeon_llvm_ctx);
1260	}
1261#endif
1262	/* End of LLVM backend setup */
1263
1264	if (!use_llvm) {
1265		ctx.file_offset[TGSI_FILE_OUTPUT] =
1266			ctx.file_offset[TGSI_FILE_INPUT] +
1267			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1268	}
1269	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
1270						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
1271
1272	/* Outside the GPR range. This will be translated to one of the
1273	 * kcache banks later. */
1274	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
1275
1276	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
1277	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
1278			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
1279	ctx.temp_reg = ctx.bc->ar_reg + 1;
1280
1281	ctx.nliterals = 0;
1282	ctx.literals = NULL;
1283	shader->fs_write_all = FALSE;
1284	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1285		tgsi_parse_token(&ctx.parse);
1286		switch (ctx.parse.FullToken.Token.Type) {
1287		case TGSI_TOKEN_TYPE_IMMEDIATE:
1288			immediate = &ctx.parse.FullToken.FullImmediate;
1289			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
1290			if(ctx.literals == NULL) {
1291				r = -ENOMEM;
1292				goto out_err;
1293			}
1294			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
1295			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
1296			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
1297			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
1298			ctx.nliterals++;
1299			break;
1300		case TGSI_TOKEN_TYPE_DECLARATION:
1301			r = tgsi_declaration(&ctx);
1302			if (r)
1303				goto out_err;
1304			break;
1305		case TGSI_TOKEN_TYPE_INSTRUCTION:
1306			break;
1307		case TGSI_TOKEN_TYPE_PROPERTY:
1308			property = &ctx.parse.FullToken.FullProperty;
1309			switch (property->Property.PropertyName) {
1310			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
1311				if (property->u[0].Data == 1)
1312					shader->fs_write_all = TRUE;
1313				break;
1314			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
1315				if (property->u[0].Data == 1)
1316					shader->vs_prohibit_ucps = TRUE;
1317				break;
1318			}
1319			break;
1320		default:
1321			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
1322			r = -EINVAL;
1323			goto out_err;
1324		}
1325	}
1326
1327	if (shader->fs_write_all && rctx->chip_class >= EVERGREEN)
1328		shader->nr_ps_max_color_exports = 8;
1329
1330	if (ctx.fragcoord_input >= 0) {
1331		if (ctx.bc->chip_class == CAYMAN) {
1332			for (j = 0 ; j < 4; j++) {
1333				struct r600_bytecode_alu alu;
1334				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1335				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1336				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1337				alu.src[0].chan = 3;
1338
1339				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1340				alu.dst.chan = j;
1341				alu.dst.write = (j == 3);
1342				alu.last = 1;
1343				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1344					return r;
1345			}
1346		} else {
1347			struct r600_bytecode_alu alu;
1348			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1349			alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1350			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1351			alu.src[0].chan = 3;
1352
1353			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1354			alu.dst.chan = 3;
1355			alu.dst.write = 1;
1356			alu.last = 1;
1357			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1358				return r;
1359		}
1360	}
1361
1362	if (shader->two_side && ctx.colors_used) {
1363		if ((r = process_twoside_color_inputs(&ctx)))
1364			return r;
1365	}
1366
1367	tgsi_parse_init(&ctx.parse, tokens);
1368	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1369		tgsi_parse_token(&ctx.parse);
1370		switch (ctx.parse.FullToken.Token.Type) {
1371		case TGSI_TOKEN_TYPE_INSTRUCTION:
1372			if (use_llvm) {
1373				continue;
1374			}
1375			r = tgsi_is_supported(&ctx);
1376			if (r)
1377				goto out_err;
1378			ctx.max_driver_temp_used = 0;
1379			/* reserve first tmp for everyone */
1380			r600_get_temp(&ctx);
1381
1382			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
1383			if ((r = tgsi_split_constant(&ctx)))
1384				goto out_err;
1385			if ((r = tgsi_split_literal_constant(&ctx)))
1386				goto out_err;
1387			if (ctx.bc->chip_class == CAYMAN)
1388				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
1389			else if (ctx.bc->chip_class >= EVERGREEN)
1390				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
1391			else
1392				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
1393			r = ctx.inst_info->process(&ctx);
1394			if (r)
1395				goto out_err;
1396			break;
1397		default:
1398			break;
1399		}
1400	}
1401
1402	/* Get instructions if we are using the LLVM backend. */
1403	if (use_llvm) {
1404		r600_bytecode_from_byte_stream(&ctx, inst_bytes, inst_byte_count);
1405		FREE(inst_bytes);
1406	}
1407
1408	noutput = shader->noutput;
1409
1410	if (ctx.clip_vertex_write) {
1411		/* need to convert a clipvertex write into clipdistance writes and not export
1412		   the clip vertex anymore */
1413
1414		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
1415		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1416		shader->output[noutput].gpr = ctx.temp_reg;
1417		noutput++;
1418		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1419		shader->output[noutput].gpr = ctx.temp_reg+1;
1420		noutput++;
1421
1422		/* reset spi_sid for clipvertex output to avoid confusing spi */
1423		shader->output[ctx.cv_output].spi_sid = 0;
1424
1425		shader->clip_dist_write = 0xFF;
1426
1427		for (i = 0; i < 8; i++) {
1428			int oreg = i >> 2;
1429			int ochan = i & 3;
1430
1431			for (j = 0; j < 4; j++) {
1432				struct r600_bytecode_alu alu;
1433				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1434				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4);
1435				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
1436				alu.src[0].chan = j;
1437
1438				alu.src[1].sel = 512 + i;
1439				alu.src[1].kc_bank = 1;
1440				alu.src[1].chan = j;
1441
1442				alu.dst.sel = ctx.temp_reg + oreg;
1443				alu.dst.chan = j;
1444				alu.dst.write = (j == ochan);
1445				if (j == 3)
1446					alu.last = 1;
1447				r = r600_bytecode_add_alu(ctx.bc, &alu);
1448				if (r)
1449					return r;
1450			}
1451		}
1452	}
1453
1454	/* Add stream outputs. */
1455	if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
1456		for (i = 0; i < so.num_outputs; i++) {
1457			struct r600_bytecode_output output;
1458
1459			if (so.output[i].output_buffer >= 4) {
1460				R600_ERR("exceeded the max number of stream output buffers, got: %d\n",
1461					 so.output[i].output_buffer);
1462				r = -EINVAL;
1463				goto out_err;
1464			}
1465			if (so.output[i].dst_offset < so.output[i].start_component) {
1466			   R600_ERR("stream_output - dst_offset cannot be less than start_component\n");
1467			   r = -EINVAL;
1468			   goto out_err;
1469			}
1470
1471			memset(&output, 0, sizeof(struct r600_bytecode_output));
1472			output.gpr = shader->output[so.output[i].register_index].gpr;
1473			output.elem_size = 0;
1474			output.array_base = so.output[i].dst_offset - so.output[i].start_component;
1475			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1476			output.burst_count = 1;
1477			output.barrier = 1;
1478			/* array_size is an upper limit for the burst_count
1479			 * with MEM_STREAM instructions */
1480			output.array_size = 0xFFF;
1481			output.comp_mask = ((1 << so.output[i].num_components) - 1) << so.output[i].start_component;
1482			if (ctx.bc->chip_class >= EVERGREEN) {
1483				switch (so.output[i].output_buffer) {
1484				case 0:
1485					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0;
1486					break;
1487				case 1:
1488					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1;
1489					break;
1490				case 2:
1491					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2;
1492					break;
1493				case 3:
1494					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3;
1495					break;
1496				}
1497			} else {
1498				switch (so.output[i].output_buffer) {
1499				case 0:
1500					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0;
1501					break;
1502				case 1:
1503					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1;
1504					break;
1505				case 2:
1506					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2;
1507					break;
1508				case 3:
1509					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3;
1510					break;
1511				}
1512			}
1513			r = r600_bytecode_add_output(ctx.bc, &output);
1514			if (r)
1515				goto out_err;
1516		}
1517	}
1518
1519	/* export output */
1520	for (i = 0, j = 0; i < noutput; i++, j++) {
1521		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1522		output[j].gpr = shader->output[i].gpr;
1523		output[j].elem_size = 3;
1524		output[j].swizzle_x = 0;
1525		output[j].swizzle_y = 1;
1526		output[j].swizzle_z = 2;
1527		output[j].swizzle_w = 3;
1528		output[j].burst_count = 1;
1529		output[j].barrier = 1;
1530		output[j].type = -1;
1531		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1532		switch (ctx.type) {
1533		case TGSI_PROCESSOR_VERTEX:
1534			switch (shader->output[i].name) {
1535			case TGSI_SEMANTIC_POSITION:
1536				output[j].array_base = next_pos_base++;
1537				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1538				break;
1539
1540			case TGSI_SEMANTIC_PSIZE:
1541				output[j].array_base = next_pos_base++;
1542				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1543				break;
1544			case TGSI_SEMANTIC_CLIPVERTEX:
1545				j--;
1546				break;
1547			case TGSI_SEMANTIC_CLIPDIST:
1548				output[j].array_base = next_pos_base++;
1549				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1550				/* spi_sid is 0 for clipdistance outputs that were generated
1551				 * for clipvertex - we don't need to pass them to PS */
1552				if (shader->output[i].spi_sid) {
1553					j++;
1554					/* duplicate it as PARAM to pass to the pixel shader */
1555					memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
1556					output[j].array_base = next_param_base++;
1557					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1558				}
1559				break;
1560			case TGSI_SEMANTIC_FOG:
1561				output[j].swizzle_y = 4; /* 0 */
1562				output[j].swizzle_z = 4; /* 0 */
1563				output[j].swizzle_w = 5; /* 1 */
1564				break;
1565			}
1566			break;
1567		case TGSI_PROCESSOR_FRAGMENT:
1568			if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
1569				/* never export more colors than the number of CBs */
1570				if (next_pixel_base && next_pixel_base >= (rctx->nr_cbufs + rctx->dual_src_blend * 1)) {
1571					/* skip export */
1572					j--;
1573					continue;
1574				}
1575				output[j].swizzle_w = rctx->alpha_to_one && rctx->multisample_enable && !rctx->cb0_is_integer ? 5 : 3;
1576				output[j].array_base = next_pixel_base++;
1577				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1578				shader->nr_ps_color_exports++;
1579				if (shader->fs_write_all && (rctx->chip_class >= EVERGREEN)) {
1580					for (k = 1; k < rctx->nr_cbufs; k++) {
1581						j++;
1582						memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1583						output[j].gpr = shader->output[i].gpr;
1584						output[j].elem_size = 3;
1585						output[j].swizzle_x = 0;
1586						output[j].swizzle_y = 1;
1587						output[j].swizzle_z = 2;
1588						output[j].swizzle_w = rctx->alpha_to_one && rctx->multisample_enable && !rctx->cb0_is_integer ? 5 : 3;
1589						output[j].burst_count = 1;
1590						output[j].barrier = 1;
1591						output[j].array_base = next_pixel_base++;
1592						output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1593						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1594						shader->nr_ps_color_exports++;
1595					}
1596				}
1597			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
1598				output[j].array_base = 61;
1599				output[j].swizzle_x = 2;
1600				output[j].swizzle_y = 7;
1601				output[j].swizzle_z = output[j].swizzle_w = 7;
1602				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1603			} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
1604				output[j].array_base = 61;
1605				output[j].swizzle_x = 7;
1606				output[j].swizzle_y = 1;
1607				output[j].swizzle_z = output[j].swizzle_w = 7;
1608				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1609			} else {
1610				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
1611				r = -EINVAL;
1612				goto out_err;
1613			}
1614			break;
1615		default:
1616			R600_ERR("unsupported processor type %d\n", ctx.type);
1617			r = -EINVAL;
1618			goto out_err;
1619		}
1620
1621		if (output[j].type==-1) {
1622			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1623			output[j].array_base = next_param_base++;
1624		}
1625	}
1626
1627	/* add fake param output for vertex shader if no param is exported */
1628	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
1629			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1630			output[j].gpr = 0;
1631			output[j].elem_size = 3;
1632			output[j].swizzle_x = 7;
1633			output[j].swizzle_y = 7;
1634			output[j].swizzle_z = 7;
1635			output[j].swizzle_w = 7;
1636			output[j].burst_count = 1;
1637			output[j].barrier = 1;
1638			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1639			output[j].array_base = 0;
1640			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1641			j++;
1642	}
1643
1644	/* add fake pixel export */
1645	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && next_pixel_base == 0) {
1646		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1647		output[j].gpr = 0;
1648		output[j].elem_size = 3;
1649		output[j].swizzle_x = 7;
1650		output[j].swizzle_y = 7;
1651		output[j].swizzle_z = 7;
1652		output[j].swizzle_w = 7;
1653		output[j].burst_count = 1;
1654		output[j].barrier = 1;
1655		output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1656		output[j].array_base = 0;
1657		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1658		j++;
1659	}
1660
1661	noutput = j;
1662
1663	/* set export done on last export of each type */
1664	for (i = noutput - 1, output_done = 0; i >= 0; i--) {
1665		if (ctx.bc->chip_class < CAYMAN) {
1666			if (i == (noutput - 1)) {
1667				output[i].end_of_program = 1;
1668			}
1669		}
1670		if (!(output_done & (1 << output[i].type))) {
1671			output_done |= (1 << output[i].type);
1672			output[i].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE);
1673		}
1674	}
1675	/* add output to bytecode */
1676	for (i = 0; i < noutput; i++) {
1677		r = r600_bytecode_add_output(ctx.bc, &output[i]);
1678		if (r)
1679			goto out_err;
1680	}
1681	/* add program end */
1682	if (ctx.bc->chip_class == CAYMAN)
1683		cm_bytecode_add_cf_end(ctx.bc);
1684
1685	/* check GPR limit - we have 124 = 128 - 4
1686	 * (4 are reserved as alu clause temporary registers) */
1687	if (ctx.bc->ngpr > 124) {
1688		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
1689		r = -ENOMEM;
1690		goto out_err;
1691	}
1692
1693	free(ctx.literals);
1694	tgsi_parse_free(&ctx.parse);
1695	return 0;
1696out_err:
1697	free(ctx.literals);
1698	tgsi_parse_free(&ctx.parse);
1699	return r;
1700}
1701
1702static int tgsi_unsupported(struct r600_shader_ctx *ctx)
1703{
1704	R600_ERR("%s tgsi opcode unsupported\n",
1705		 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
1706	return -EINVAL;
1707}
1708
/*
 * Instruction handler that emits nothing and always succeeds.
 * The context is not examined.
 */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	(void)ctx;
	return 0;
}
1713
1714static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
1715			const struct r600_shader_src *shader_src,
1716			unsigned chan)
1717{
1718	bc_src->sel = shader_src->sel;
1719	bc_src->chan = shader_src->swizzle[chan];
1720	bc_src->neg = shader_src->neg;
1721	bc_src->abs = shader_src->abs;
1722	bc_src->rel = shader_src->rel;
1723	bc_src->value = shader_src->value[bc_src->chan];
1724}
1725
1726static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
1727{
1728	bc_src->abs = 1;
1729	bc_src->neg = 0;
1730}
1731
1732static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
1733{
1734	bc_src->neg = !bc_src->neg;
1735}
1736
1737static void tgsi_dst(struct r600_shader_ctx *ctx,
1738		     const struct tgsi_full_dst_register *tgsi_dst,
1739		     unsigned swizzle,
1740		     struct r600_bytecode_alu_dst *r600_dst)
1741{
1742	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1743
1744	r600_dst->sel = tgsi_dst->Register.Index;
1745	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
1746	r600_dst->chan = swizzle;
1747	r600_dst->write = 1;
1748	if (tgsi_dst->Register.Indirect)
1749		r600_dst->rel = V_SQ_REL_RELATIVE;
1750	if (inst->Instruction.Saturate) {
1751		r600_dst->clamp = 1;
1752	}
1753}
1754
/*
 * Return the index (0..3) of the highest channel enabled in writemask.
 * Only the low four bits are examined; 0 is returned when none is set.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* Search from W down to Y; X and "nothing set" both yield 0. */
	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1 << chan))
			return chan;
	}
	return 0;
}
1766
1767static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
1768{
1769	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1770	struct r600_bytecode_alu alu;
1771	int i, j, r;
1772	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1773
1774	for (i = 0; i < lasti + 1; i++) {
1775		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1776			continue;
1777
1778		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1779		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1780
1781		alu.inst = ctx->inst_info->r600_opcode;
1782		if (!swap) {
1783			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1784				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
1785			}
1786		} else {
1787			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
1788			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1789		}
1790		/* handle some special cases */
1791		switch (ctx->inst_info->tgsi_opcode) {
1792		case TGSI_OPCODE_SUB:
1793			r600_bytecode_src_toggle_neg(&alu.src[1]);
1794			break;
1795		case TGSI_OPCODE_ABS:
1796			r600_bytecode_src_set_abs(&alu.src[0]);
1797			break;
1798		default:
1799			break;
1800		}
1801		if (i == lasti || trans_only) {
1802			alu.last = 1;
1803		}
1804		r = r600_bytecode_add_alu(ctx->bc, &alu);
1805		if (r)
1806			return r;
1807	}
1808	return 0;
1809}
1810
/* Per-channel ALU op: sources in TGSI order, grouped into one instruction group. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1815
/* Per-channel ALU op with the first two sources exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	const int swap = 1;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1820
/* Per-channel ALU op where every emitted instruction ends its own group. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 1;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1825
1826static int tgsi_ineg(struct r600_shader_ctx *ctx)
1827{
1828	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1829	struct r600_bytecode_alu alu;
1830	int i, r;
1831	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1832
1833	for (i = 0; i < lasti + 1; i++) {
1834
1835		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1836			continue;
1837		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1838		alu.inst = ctx->inst_info->r600_opcode;
1839
1840		alu.src[0].sel = V_SQ_ALU_SRC_0;
1841
1842		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1843
1844		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1845
1846		if (i == lasti) {
1847			alu.last = 1;
1848		}
1849		r = r600_bytecode_add_alu(ctx->bc, &alu);
1850		if (r)
1851			return r;
1852	}
1853	return 0;
1854
1855}
1856
1857static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
1858{
1859	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1860	int i, j, r;
1861	struct r600_bytecode_alu alu;
1862	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1863
1864	for (i = 0 ; i < last_slot; i++) {
1865		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1866		alu.inst = ctx->inst_info->r600_opcode;
1867		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1868			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
1869		}
1870		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1871		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
1872
1873		if (i == last_slot - 1)
1874			alu.last = 1;
1875		r = r600_bytecode_add_alu(ctx->bc, &alu);
1876		if (r)
1877			return r;
1878	}
1879	return 0;
1880}
1881
1882static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
1883{
1884	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1885	int i, j, k, r;
1886	struct r600_bytecode_alu alu;
1887	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1888	for (k = 0; k < last_slot; k++) {
1889		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
1890			continue;
1891
1892		for (i = 0 ; i < 4; i++) {
1893			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1894			alu.inst = ctx->inst_info->r600_opcode;
1895			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1896				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
1897			}
1898			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1899			alu.dst.write = (i == k);
1900			if (i == 3)
1901				alu.last = 1;
1902			r = r600_bytecode_add_alu(ctx->bc, &alu);
1903			if (r)
1904				return r;
1905		}
1906	}
1907	return 0;
1908}
1909
1910/*
1911 * r600 - trunc to -PI..PI range
1912 * r700 - normalize by dividing by 2PI
1913 * see fdo bug 27901
1914 */
1915static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
1916{
1917	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
1918	static float double_pi = 3.1415926535 * 2;
1919	static float neg_pi = -3.1415926535;
1920
1921	int r;
1922	struct r600_bytecode_alu alu;
1923
1924	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1925	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
1926	alu.is_op3 = 1;
1927
1928	alu.dst.chan = 0;
1929	alu.dst.sel = ctx->temp_reg;
1930	alu.dst.write = 1;
1931
1932	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
1933
1934	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1935	alu.src[1].chan = 0;
1936	alu.src[1].value = *(uint32_t *)&half_inv_pi;
1937	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
1938	alu.src[2].chan = 0;
1939	alu.last = 1;
1940	r = r600_bytecode_add_alu(ctx->bc, &alu);
1941	if (r)
1942		return r;
1943
1944	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1945	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
1946
1947	alu.dst.chan = 0;
1948	alu.dst.sel = ctx->temp_reg;
1949	alu.dst.write = 1;
1950
1951	alu.src[0].sel = ctx->temp_reg;
1952	alu.src[0].chan = 0;
1953	alu.last = 1;
1954	r = r600_bytecode_add_alu(ctx->bc, &alu);
1955	if (r)
1956		return r;
1957
1958	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1959	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
1960	alu.is_op3 = 1;
1961
1962	alu.dst.chan = 0;
1963	alu.dst.sel = ctx->temp_reg;
1964	alu.dst.write = 1;
1965
1966	alu.src[0].sel = ctx->temp_reg;
1967	alu.src[0].chan = 0;
1968
1969	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1970	alu.src[1].chan = 0;
1971	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
1972	alu.src[2].chan = 0;
1973
1974	if (ctx->bc->chip_class == R600) {
1975		alu.src[1].value = *(uint32_t *)&double_pi;
1976		alu.src[2].value = *(uint32_t *)&neg_pi;
1977	} else {
1978		alu.src[1].sel = V_SQ_ALU_SRC_1;
1979		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
1980		alu.src[2].neg = 1;
1981	}
1982
1983	alu.last = 1;
1984	r = r600_bytecode_add_alu(ctx->bc, &alu);
1985	if (r)
1986		return r;
1987	return 0;
1988}
1989
1990static int cayman_trig(struct r600_shader_ctx *ctx)
1991{
1992	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1993	struct r600_bytecode_alu alu;
1994	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1995	int i, r;
1996
1997	r = tgsi_setup_trig(ctx);
1998	if (r)
1999		return r;
2000
2001
2002	for (i = 0; i < last_slot; i++) {
2003		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2004		alu.inst = ctx->inst_info->r600_opcode;
2005		alu.dst.chan = i;
2006
2007		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2008		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2009
2010		alu.src[0].sel = ctx->temp_reg;
2011		alu.src[0].chan = 0;
2012		if (i == last_slot - 1)
2013			alu.last = 1;
2014		r = r600_bytecode_add_alu(ctx->bc, &alu);
2015		if (r)
2016			return r;
2017	}
2018	return 0;
2019}
2020
2021static int tgsi_trig(struct r600_shader_ctx *ctx)
2022{
2023	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2024	struct r600_bytecode_alu alu;
2025	int i, r;
2026	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2027
2028	r = tgsi_setup_trig(ctx);
2029	if (r)
2030		return r;
2031
2032	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2033	alu.inst = ctx->inst_info->r600_opcode;
2034	alu.dst.chan = 0;
2035	alu.dst.sel = ctx->temp_reg;
2036	alu.dst.write = 1;
2037
2038	alu.src[0].sel = ctx->temp_reg;
2039	alu.src[0].chan = 0;
2040	alu.last = 1;
2041	r = r600_bytecode_add_alu(ctx->bc, &alu);
2042	if (r)
2043		return r;
2044
2045	/* replicate result */
2046	for (i = 0; i < lasti + 1; i++) {
2047		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2048			continue;
2049
2050		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2051		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2052
2053		alu.src[0].sel = ctx->temp_reg;
2054		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2055		if (i == lasti)
2056			alu.last = 1;
2057		r = r600_bytecode_add_alu(ctx->bc, &alu);
2058		if (r)
2059			return r;
2060	}
2061	return 0;
2062}
2063
2064static int tgsi_scs(struct r600_shader_ctx *ctx)
2065{
2066	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2067	struct r600_bytecode_alu alu;
2068	int i, r;
2069
2070	/* We'll only need the trig stuff if we are going to write to the
2071	 * X or Y components of the destination vector.
2072	 */
2073	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
2074		r = tgsi_setup_trig(ctx);
2075		if (r)
2076			return r;
2077	}
2078
2079	/* dst.x = COS */
2080	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2081		if (ctx->bc->chip_class == CAYMAN) {
2082			for (i = 0 ; i < 3; i++) {
2083				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2084				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2085				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2086
2087				if (i == 0)
2088					alu.dst.write = 1;
2089				else
2090					alu.dst.write = 0;
2091				alu.src[0].sel = ctx->temp_reg;
2092				alu.src[0].chan = 0;
2093				if (i == 2)
2094					alu.last = 1;
2095				r = r600_bytecode_add_alu(ctx->bc, &alu);
2096				if (r)
2097					return r;
2098			}
2099		} else {
2100			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2101			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2102			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2103
2104			alu.src[0].sel = ctx->temp_reg;
2105			alu.src[0].chan = 0;
2106			alu.last = 1;
2107			r = r600_bytecode_add_alu(ctx->bc, &alu);
2108			if (r)
2109				return r;
2110		}
2111	}
2112
2113	/* dst.y = SIN */
2114	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2115		if (ctx->bc->chip_class == CAYMAN) {
2116			for (i = 0 ; i < 3; i++) {
2117				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2118				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2119				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2120				if (i == 1)
2121					alu.dst.write = 1;
2122				else
2123					alu.dst.write = 0;
2124				alu.src[0].sel = ctx->temp_reg;
2125				alu.src[0].chan = 0;
2126				if (i == 2)
2127					alu.last = 1;
2128				r = r600_bytecode_add_alu(ctx->bc, &alu);
2129				if (r)
2130					return r;
2131			}
2132		} else {
2133			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2134			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2135			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2136
2137			alu.src[0].sel = ctx->temp_reg;
2138			alu.src[0].chan = 0;
2139			alu.last = 1;
2140			r = r600_bytecode_add_alu(ctx->bc, &alu);
2141			if (r)
2142				return r;
2143		}
2144	}
2145
2146	/* dst.z = 0.0; */
2147	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2148		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2149
2150		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2151
2152		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2153
2154		alu.src[0].sel = V_SQ_ALU_SRC_0;
2155		alu.src[0].chan = 0;
2156
2157		alu.last = 1;
2158
2159		r = r600_bytecode_add_alu(ctx->bc, &alu);
2160		if (r)
2161			return r;
2162	}
2163
2164	/* dst.w = 1.0; */
2165	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2166		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2167
2168		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2169
2170		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2171
2172		alu.src[0].sel = V_SQ_ALU_SRC_1;
2173		alu.src[0].chan = 0;
2174
2175		alu.last = 1;
2176
2177		r = r600_bytecode_add_alu(ctx->bc, &alu);
2178		if (r)
2179			return r;
2180	}
2181
2182	return 0;
2183}
2184
2185static int tgsi_kill(struct r600_shader_ctx *ctx)
2186{
2187	struct r600_bytecode_alu alu;
2188	int i, r;
2189
2190	for (i = 0; i < 4; i++) {
2191		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2192		alu.inst = ctx->inst_info->r600_opcode;
2193
2194		alu.dst.chan = i;
2195
2196		alu.src[0].sel = V_SQ_ALU_SRC_0;
2197
2198		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) {
2199			alu.src[1].sel = V_SQ_ALU_SRC_1;
2200			alu.src[1].neg = 1;
2201		} else {
2202			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2203		}
2204		if (i == 3) {
2205			alu.last = 1;
2206		}
2207		r = r600_bytecode_add_alu(ctx->bc, &alu);
2208		if (r)
2209			return r;
2210	}
2211
2212	/* kill must be last in ALU */
2213	ctx->bc->force_add_cf = 1;
2214	ctx->shader->uses_kill = TRUE;
2215	return 0;
2216}
2217
2218static int tgsi_lit(struct r600_shader_ctx *ctx)
2219{
2220	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2221	struct r600_bytecode_alu alu;
2222	int r;
2223
2224	/* tmp.x = max(src.y, 0.0) */
2225	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2226	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2227	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
2228	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2229	alu.src[1].chan = 1;
2230
2231	alu.dst.sel = ctx->temp_reg;
2232	alu.dst.chan = 0;
2233	alu.dst.write = 1;
2234
2235	alu.last = 1;
2236	r = r600_bytecode_add_alu(ctx->bc, &alu);
2237	if (r)
2238		return r;
2239
2240	if (inst->Dst[0].Register.WriteMask & (1 << 2))
2241	{
2242		int chan;
2243		int sel;
2244		int i;
2245
2246		if (ctx->bc->chip_class == CAYMAN) {
2247			for (i = 0; i < 3; i++) {
2248				/* tmp.z = log(tmp.x) */
2249				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2250				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2251				alu.src[0].sel = ctx->temp_reg;
2252				alu.src[0].chan = 0;
2253				alu.dst.sel = ctx->temp_reg;
2254				alu.dst.chan = i;
2255				if (i == 2) {
2256					alu.dst.write = 1;
2257					alu.last = 1;
2258				} else
2259					alu.dst.write = 0;
2260
2261				r = r600_bytecode_add_alu(ctx->bc, &alu);
2262				if (r)
2263					return r;
2264			}
2265		} else {
2266			/* tmp.z = log(tmp.x) */
2267			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2268			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2269			alu.src[0].sel = ctx->temp_reg;
2270			alu.src[0].chan = 0;
2271			alu.dst.sel = ctx->temp_reg;
2272			alu.dst.chan = 2;
2273			alu.dst.write = 1;
2274			alu.last = 1;
2275			r = r600_bytecode_add_alu(ctx->bc, &alu);
2276			if (r)
2277				return r;
2278		}
2279
2280		chan = alu.dst.chan;
2281		sel = alu.dst.sel;
2282
2283		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
2284		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2285		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT);
2286		alu.src[0].sel  = sel;
2287		alu.src[0].chan = chan;
2288		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
2289		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
2290		alu.dst.sel = ctx->temp_reg;
2291		alu.dst.chan = 0;
2292		alu.dst.write = 1;
2293		alu.is_op3 = 1;
2294		alu.last = 1;
2295		r = r600_bytecode_add_alu(ctx->bc, &alu);
2296		if (r)
2297			return r;
2298
2299		if (ctx->bc->chip_class == CAYMAN) {
2300			for (i = 0; i < 3; i++) {
2301				/* dst.z = exp(tmp.x) */
2302				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2303				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2304				alu.src[0].sel = ctx->temp_reg;
2305				alu.src[0].chan = 0;
2306				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2307				if (i == 2) {
2308					alu.dst.write = 1;
2309					alu.last = 1;
2310				} else
2311					alu.dst.write = 0;
2312				r = r600_bytecode_add_alu(ctx->bc, &alu);
2313				if (r)
2314					return r;
2315			}
2316		} else {
2317			/* dst.z = exp(tmp.x) */
2318			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2319			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2320			alu.src[0].sel = ctx->temp_reg;
2321			alu.src[0].chan = 0;
2322			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2323			alu.last = 1;
2324			r = r600_bytecode_add_alu(ctx->bc, &alu);
2325			if (r)
2326				return r;
2327		}
2328	}
2329
2330	/* dst.x, <- 1.0  */
2331	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2332	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2333	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
2334	alu.src[0].chan = 0;
2335	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2336	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
2337	r = r600_bytecode_add_alu(ctx->bc, &alu);
2338	if (r)
2339		return r;
2340
2341	/* dst.y = max(src.x, 0.0) */
2342	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2343	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2344	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2345	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2346	alu.src[1].chan = 0;
2347	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2348	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
2349	r = r600_bytecode_add_alu(ctx->bc, &alu);
2350	if (r)
2351		return r;
2352
2353	/* dst.w, <- 1.0  */
2354	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2355	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2356	alu.src[0].sel  = V_SQ_ALU_SRC_1;
2357	alu.src[0].chan = 0;
2358	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2359	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
2360	alu.last = 1;
2361	r = r600_bytecode_add_alu(ctx->bc, &alu);
2362	if (r)
2363		return r;
2364
2365	return 0;
2366}
2367
2368static int tgsi_rsq(struct r600_shader_ctx *ctx)
2369{
2370	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2371	struct r600_bytecode_alu alu;
2372	int i, r;
2373
2374	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2375
2376	/* XXX:
2377	 * For state trackers other than OpenGL, we'll want to use
2378	 * _RECIPSQRT_IEEE instead.
2379	 */
2380	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED);
2381
2382	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2383		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2384		r600_bytecode_src_set_abs(&alu.src[i]);
2385	}
2386	alu.dst.sel = ctx->temp_reg;
2387	alu.dst.write = 1;
2388	alu.last = 1;
2389	r = r600_bytecode_add_alu(ctx->bc, &alu);
2390	if (r)
2391		return r;
2392	/* replicate result */
2393	return tgsi_helper_tempx_replicate(ctx);
2394}
2395
2396static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
2397{
2398	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2399	struct r600_bytecode_alu alu;
2400	int i, r;
2401
2402	for (i = 0; i < 4; i++) {
2403		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2404		alu.src[0].sel = ctx->temp_reg;
2405		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2406		alu.dst.chan = i;
2407		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2408		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2409		if (i == 3)
2410			alu.last = 1;
2411		r = r600_bytecode_add_alu(ctx->bc, &alu);
2412		if (r)
2413			return r;
2414	}
2415	return 0;
2416}
2417
2418static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
2419{
2420	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2421	struct r600_bytecode_alu alu;
2422	int i, r;
2423
2424	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2425	alu.inst = ctx->inst_info->r600_opcode;
2426	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2427		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2428	}
2429	alu.dst.sel = ctx->temp_reg;
2430	alu.dst.write = 1;
2431	alu.last = 1;
2432	r = r600_bytecode_add_alu(ctx->bc, &alu);
2433	if (r)
2434		return r;
2435	/* replicate result */
2436	return tgsi_helper_tempx_replicate(ctx);
2437}
2438
2439static int cayman_pow(struct r600_shader_ctx *ctx)
2440{
2441	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2442	int i, r;
2443	struct r600_bytecode_alu alu;
2444	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2445
2446	for (i = 0; i < 3; i++) {
2447		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2448		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2449		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2450		alu.dst.sel = ctx->temp_reg;
2451		alu.dst.chan = i;
2452		alu.dst.write = 1;
2453		if (i == 2)
2454			alu.last = 1;
2455		r = r600_bytecode_add_alu(ctx->bc, &alu);
2456		if (r)
2457			return r;
2458	}
2459
2460	/* b * LOG2(a) */
2461	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2462	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2463	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2464	alu.src[1].sel = ctx->temp_reg;
2465	alu.dst.sel = ctx->temp_reg;
2466	alu.dst.write = 1;
2467	alu.last = 1;
2468	r = r600_bytecode_add_alu(ctx->bc, &alu);
2469	if (r)
2470		return r;
2471
2472	for (i = 0; i < last_slot; i++) {
2473		/* POW(a,b) = EXP2(b * LOG2(a))*/
2474		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2475		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2476		alu.src[0].sel = ctx->temp_reg;
2477
2478		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2479		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2480		if (i == last_slot - 1)
2481			alu.last = 1;
2482		r = r600_bytecode_add_alu(ctx->bc, &alu);
2483		if (r)
2484			return r;
2485	}
2486	return 0;
2487}
2488
2489static int tgsi_pow(struct r600_shader_ctx *ctx)
2490{
2491	struct r600_bytecode_alu alu;
2492	int r;
2493
2494	/* LOG2(a) */
2495	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2496	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2497	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2498	alu.dst.sel = ctx->temp_reg;
2499	alu.dst.write = 1;
2500	alu.last = 1;
2501	r = r600_bytecode_add_alu(ctx->bc, &alu);
2502	if (r)
2503		return r;
2504	/* b * LOG2(a) */
2505	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2506	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2507	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2508	alu.src[1].sel = ctx->temp_reg;
2509	alu.dst.sel = ctx->temp_reg;
2510	alu.dst.write = 1;
2511	alu.last = 1;
2512	r = r600_bytecode_add_alu(ctx->bc, &alu);
2513	if (r)
2514		return r;
2515	/* POW(a,b) = EXP2(b * LOG2(a))*/
2516	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2517	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2518	alu.src[0].sel = ctx->temp_reg;
2519	alu.dst.sel = ctx->temp_reg;
2520	alu.dst.write = 1;
2521	alu.last = 1;
2522	r = r600_bytecode_add_alu(ctx->bc, &alu);
2523	if (r)
2524		return r;
2525	return tgsi_helper_tempx_replicate(ctx);
2526}
2527
2528static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
2529{
2530	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2531	struct r600_bytecode_alu alu;
2532	int i, r, j;
2533	unsigned write_mask = inst->Dst[0].Register.WriteMask;
2534	int tmp0 = ctx->temp_reg;
2535	int tmp1 = r600_get_temp(ctx);
2536	int tmp2 = r600_get_temp(ctx);
2537	int tmp3 = r600_get_temp(ctx);
2538	/* Unsigned path:
2539	 *
2540	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
2541	 *
2542	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
2543	 * 2. tmp0.z = lo (tmp0.x * src2)
2544	 * 3. tmp0.w = -tmp0.z
2545	 * 4. tmp0.y = hi (tmp0.x * src2)
2546	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
2547	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
2548	 * 7. tmp1.x = tmp0.x - tmp0.w
2549	 * 8. tmp1.y = tmp0.x + tmp0.w
2550	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
2551	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
2552	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
2553	 *
2554	 * 12. tmp0.w = src1 - tmp0.y       = r
2555	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
2556	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
2557	 *
2558	 * if DIV
2559	 *
2560	 *   15. tmp1.z = tmp0.z + 1			= q + 1
2561	 *   16. tmp1.w = tmp0.z - 1			= q - 1
2562	 *
2563	 * else MOD
2564	 *
2565	 *   15. tmp1.z = tmp0.w - src2			= r - src2
2566	 *   16. tmp1.w = tmp0.w + src2			= r + src2
2567	 *
2568	 * endif
2569	 *
2570	 * 17. tmp1.x = tmp1.x & tmp1.y
2571	 *
2572	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
2573	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
2574	 *
2575	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
2576	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
2577	 *
2578	 * Signed path:
2579	 *
2580	 * Same as unsigned, using abs values of the operands,
2581	 * and fixing the sign of the result in the end.
2582	 */
2583
2584	for (i = 0; i < 4; i++) {
2585		if (!(write_mask & (1<<i)))
2586			continue;
2587
2588		if (signed_op) {
2589
2590			/* tmp2.x = -src0 */
2591			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2592			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2593
2594			alu.dst.sel = tmp2;
2595			alu.dst.chan = 0;
2596			alu.dst.write = 1;
2597
2598			alu.src[0].sel = V_SQ_ALU_SRC_0;
2599
2600			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2601
2602			alu.last = 1;
2603			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2604				return r;
2605
2606			/* tmp2.y = -src1 */
2607			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2608			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2609
2610			alu.dst.sel = tmp2;
2611			alu.dst.chan = 1;
2612			alu.dst.write = 1;
2613
2614			alu.src[0].sel = V_SQ_ALU_SRC_0;
2615
2616			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2617
2618			alu.last = 1;
2619			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2620				return r;
2621
2622			/* tmp2.z sign bit is set if src0 and src2 signs are different */
2623			/* it will be a sign of the quotient */
2624			if (!mod) {
2625
2626				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2627				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT);
2628
2629				alu.dst.sel = tmp2;
2630				alu.dst.chan = 2;
2631				alu.dst.write = 1;
2632
2633				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2634				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2635
2636				alu.last = 1;
2637				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2638					return r;
2639			}
2640
2641			/* tmp2.x = |src0| */
2642			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2643			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2644			alu.is_op3 = 1;
2645
2646			alu.dst.sel = tmp2;
2647			alu.dst.chan = 0;
2648			alu.dst.write = 1;
2649
2650			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2651			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2652			alu.src[2].sel = tmp2;
2653			alu.src[2].chan = 0;
2654
2655			alu.last = 1;
2656			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2657				return r;
2658
2659			/* tmp2.y = |src1| */
2660			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2661			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2662			alu.is_op3 = 1;
2663
2664			alu.dst.sel = tmp2;
2665			alu.dst.chan = 1;
2666			alu.dst.write = 1;
2667
2668			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2669			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2670			alu.src[2].sel = tmp2;
2671			alu.src[2].chan = 1;
2672
2673			alu.last = 1;
2674			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2675				return r;
2676
2677		}
2678
2679		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
2680		if (ctx->bc->chip_class == CAYMAN) {
2681			/* tmp3.x = u2f(src2) */
2682			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2683			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
2684
2685			alu.dst.sel = tmp3;
2686			alu.dst.chan = 0;
2687			alu.dst.write = 1;
2688
2689			if (signed_op) {
2690				alu.src[0].sel = tmp2;
2691				alu.src[0].chan = 1;
2692			} else {
2693				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2694			}
2695
2696			alu.last = 1;
2697			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2698				return r;
2699
2700			/* tmp0.x = recip(tmp3.x) */
2701			for (j = 0 ; j < 3; j++) {
2702				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2703				alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE;
2704
2705				alu.dst.sel = tmp0;
2706				alu.dst.chan = j;
2707				alu.dst.write = (j == 0);
2708
2709				alu.src[0].sel = tmp3;
2710				alu.src[0].chan = 0;
2711
2712				if (j == 2)
2713					alu.last = 1;
2714				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2715					return r;
2716			}
2717
2718			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2719			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2720
2721			alu.src[0].sel = tmp0;
2722			alu.src[0].chan = 0;
2723
2724			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2725			alu.src[1].value = 0x4f800000;
2726
2727			alu.dst.sel = tmp3;
2728			alu.dst.write = 1;
2729			alu.last = 1;
2730			r = r600_bytecode_add_alu(ctx->bc, &alu);
2731			if (r)
2732				return r;
2733
2734			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2735			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
2736
2737			alu.dst.sel = tmp0;
2738			alu.dst.chan = 0;
2739			alu.dst.write = 1;
2740
2741			alu.src[0].sel = tmp3;
2742			alu.src[0].chan = 0;
2743
2744			alu.last = 1;
2745			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2746				return r;
2747
2748		} else {
2749			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2750			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT);
2751
2752			alu.dst.sel = tmp0;
2753			alu.dst.chan = 0;
2754			alu.dst.write = 1;
2755
2756			if (signed_op) {
2757				alu.src[0].sel = tmp2;
2758				alu.src[0].chan = 1;
2759			} else {
2760				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2761			}
2762
2763			alu.last = 1;
2764			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2765				return r;
2766		}
2767
2768		/* 2. tmp0.z = lo (tmp0.x * src2) */
2769		if (ctx->bc->chip_class == CAYMAN) {
2770			for (j = 0 ; j < 4; j++) {
2771				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2772				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2773
2774				alu.dst.sel = tmp0;
2775				alu.dst.chan = j;
2776				alu.dst.write = (j == 2);
2777
2778				alu.src[0].sel = tmp0;
2779				alu.src[0].chan = 0;
2780				if (signed_op) {
2781					alu.src[1].sel = tmp2;
2782					alu.src[1].chan = 1;
2783				} else {
2784					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2785				}
2786
2787				alu.last = (j == 3);
2788				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2789					return r;
2790			}
2791		} else {
2792			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2793			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2794
2795			alu.dst.sel = tmp0;
2796			alu.dst.chan = 2;
2797			alu.dst.write = 1;
2798
2799			alu.src[0].sel = tmp0;
2800			alu.src[0].chan = 0;
2801			if (signed_op) {
2802				alu.src[1].sel = tmp2;
2803				alu.src[1].chan = 1;
2804			} else {
2805				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2806			}
2807
2808			alu.last = 1;
2809			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2810				return r;
2811		}
2812
2813		/* 3. tmp0.w = -tmp0.z */
2814		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2815		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2816
2817		alu.dst.sel = tmp0;
2818		alu.dst.chan = 3;
2819		alu.dst.write = 1;
2820
2821		alu.src[0].sel = V_SQ_ALU_SRC_0;
2822		alu.src[1].sel = tmp0;
2823		alu.src[1].chan = 2;
2824
2825		alu.last = 1;
2826		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2827			return r;
2828
2829		/* 4. tmp0.y = hi (tmp0.x * src2) */
2830		if (ctx->bc->chip_class == CAYMAN) {
2831			for (j = 0 ; j < 4; j++) {
2832				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2833				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2834
2835				alu.dst.sel = tmp0;
2836				alu.dst.chan = j;
2837				alu.dst.write = (j == 1);
2838
2839				alu.src[0].sel = tmp0;
2840				alu.src[0].chan = 0;
2841
2842				if (signed_op) {
2843					alu.src[1].sel = tmp2;
2844					alu.src[1].chan = 1;
2845				} else {
2846					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2847				}
2848				alu.last = (j == 3);
2849				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2850					return r;
2851			}
2852		} else {
2853			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2854			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2855
2856			alu.dst.sel = tmp0;
2857			alu.dst.chan = 1;
2858			alu.dst.write = 1;
2859
2860			alu.src[0].sel = tmp0;
2861			alu.src[0].chan = 0;
2862
2863			if (signed_op) {
2864				alu.src[1].sel = tmp2;
2865				alu.src[1].chan = 1;
2866			} else {
2867				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2868			}
2869
2870			alu.last = 1;
2871			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2872				return r;
2873		}
2874
2875		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
2876		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2877		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
2878		alu.is_op3 = 1;
2879
2880		alu.dst.sel = tmp0;
2881		alu.dst.chan = 2;
2882		alu.dst.write = 1;
2883
2884		alu.src[0].sel = tmp0;
2885		alu.src[0].chan = 1;
2886		alu.src[1].sel = tmp0;
2887		alu.src[1].chan = 3;
2888		alu.src[2].sel = tmp0;
2889		alu.src[2].chan = 2;
2890
2891		alu.last = 1;
2892		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2893			return r;
2894
2895		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
2896		if (ctx->bc->chip_class == CAYMAN) {
2897			for (j = 0 ; j < 4; j++) {
2898				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2899				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2900
2901				alu.dst.sel = tmp0;
2902				alu.dst.chan = j;
2903				alu.dst.write = (j == 3);
2904
2905				alu.src[0].sel = tmp0;
2906				alu.src[0].chan = 2;
2907
2908				alu.src[1].sel = tmp0;
2909				alu.src[1].chan = 0;
2910
2911				alu.last = (j == 3);
2912				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2913					return r;
2914			}
2915		} else {
2916			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2917			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2918
2919			alu.dst.sel = tmp0;
2920			alu.dst.chan = 3;
2921			alu.dst.write = 1;
2922
2923			alu.src[0].sel = tmp0;
2924			alu.src[0].chan = 2;
2925
2926			alu.src[1].sel = tmp0;
2927			alu.src[1].chan = 0;
2928
2929			alu.last = 1;
2930			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2931				return r;
2932		}
2933
2934		/* 7. tmp1.x = tmp0.x - tmp0.w */
2935		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2936		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2937
2938		alu.dst.sel = tmp1;
2939		alu.dst.chan = 0;
2940		alu.dst.write = 1;
2941
2942		alu.src[0].sel = tmp0;
2943		alu.src[0].chan = 0;
2944		alu.src[1].sel = tmp0;
2945		alu.src[1].chan = 3;
2946
2947		alu.last = 1;
2948		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2949			return r;
2950
2951		/* 8. tmp1.y = tmp0.x + tmp0.w */
2952		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2953		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
2954
2955		alu.dst.sel = tmp1;
2956		alu.dst.chan = 1;
2957		alu.dst.write = 1;
2958
2959		alu.src[0].sel = tmp0;
2960		alu.src[0].chan = 0;
2961		alu.src[1].sel = tmp0;
2962		alu.src[1].chan = 3;
2963
2964		alu.last = 1;
2965		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2966			return r;
2967
2968		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
2969		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2970		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
2971		alu.is_op3 = 1;
2972
2973		alu.dst.sel = tmp0;
2974		alu.dst.chan = 0;
2975		alu.dst.write = 1;
2976
2977		alu.src[0].sel = tmp0;
2978		alu.src[0].chan = 1;
2979		alu.src[1].sel = tmp1;
2980		alu.src[1].chan = 1;
2981		alu.src[2].sel = tmp1;
2982		alu.src[2].chan = 0;
2983
2984		alu.last = 1;
2985		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2986			return r;
2987
2988		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
2989		if (ctx->bc->chip_class == CAYMAN) {
2990			for (j = 0 ; j < 4; j++) {
2991				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2992				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2993
2994				alu.dst.sel = tmp0;
2995				alu.dst.chan = j;
2996				alu.dst.write = (j == 2);
2997
2998				alu.src[0].sel = tmp0;
2999				alu.src[0].chan = 0;
3000
3001				if (signed_op) {
3002					alu.src[1].sel = tmp2;
3003					alu.src[1].chan = 0;
3004				} else {
3005					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3006				}
3007
3008				alu.last = (j == 3);
3009				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3010					return r;
3011			}
3012		} else {
3013			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3014			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3015
3016			alu.dst.sel = tmp0;
3017			alu.dst.chan = 2;
3018			alu.dst.write = 1;
3019
3020			alu.src[0].sel = tmp0;
3021			alu.src[0].chan = 0;
3022
3023			if (signed_op) {
3024				alu.src[1].sel = tmp2;
3025				alu.src[1].chan = 0;
3026			} else {
3027				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3028			}
3029
3030			alu.last = 1;
3031			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3032				return r;
3033		}
3034
3035		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
3036		if (ctx->bc->chip_class == CAYMAN) {
3037			for (j = 0 ; j < 4; j++) {
3038				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3039				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3040
3041				alu.dst.sel = tmp0;
3042				alu.dst.chan = j;
3043				alu.dst.write = (j == 1);
3044
3045				if (signed_op) {
3046					alu.src[0].sel = tmp2;
3047					alu.src[0].chan = 1;
3048				} else {
3049					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3050				}
3051
3052				alu.src[1].sel = tmp0;
3053				alu.src[1].chan = 2;
3054
3055				alu.last = (j == 3);
3056				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3057					return r;
3058			}
3059		} else {
3060			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3061			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3062
3063			alu.dst.sel = tmp0;
3064			alu.dst.chan = 1;
3065			alu.dst.write = 1;
3066
3067			if (signed_op) {
3068				alu.src[0].sel = tmp2;
3069				alu.src[0].chan = 1;
3070			} else {
3071				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3072			}
3073
3074			alu.src[1].sel = tmp0;
3075			alu.src[1].chan = 2;
3076
3077			alu.last = 1;
3078			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3079				return r;
3080		}
3081
3082		/* 12. tmp0.w = src1 - tmp0.y       = r */
3083		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3084		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3085
3086		alu.dst.sel = tmp0;
3087		alu.dst.chan = 3;
3088		alu.dst.write = 1;
3089
3090		if (signed_op) {
3091			alu.src[0].sel = tmp2;
3092			alu.src[0].chan = 0;
3093		} else {
3094			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3095		}
3096
3097		alu.src[1].sel = tmp0;
3098		alu.src[1].chan = 1;
3099
3100		alu.last = 1;
3101		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3102			return r;
3103
3104		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
3105		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3106		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3107
3108		alu.dst.sel = tmp1;
3109		alu.dst.chan = 0;
3110		alu.dst.write = 1;
3111
3112		alu.src[0].sel = tmp0;
3113		alu.src[0].chan = 3;
3114		if (signed_op) {
3115			alu.src[1].sel = tmp2;
3116			alu.src[1].chan = 1;
3117		} else {
3118			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3119		}
3120
3121		alu.last = 1;
3122		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3123			return r;
3124
3125		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
3126		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3127		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3128
3129		alu.dst.sel = tmp1;
3130		alu.dst.chan = 1;
3131		alu.dst.write = 1;
3132
3133		if (signed_op) {
3134			alu.src[0].sel = tmp2;
3135			alu.src[0].chan = 0;
3136		} else {
3137			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3138		}
3139
3140		alu.src[1].sel = tmp0;
3141		alu.src[1].chan = 1;
3142
3143		alu.last = 1;
3144		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3145			return r;
3146
3147		if (mod) { /* UMOD */
3148
3149			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
3150			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3151			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3152
3153			alu.dst.sel = tmp1;
3154			alu.dst.chan = 2;
3155			alu.dst.write = 1;
3156
3157			alu.src[0].sel = tmp0;
3158			alu.src[0].chan = 3;
3159
3160			if (signed_op) {
3161				alu.src[1].sel = tmp2;
3162				alu.src[1].chan = 1;
3163			} else {
3164				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3165			}
3166
3167			alu.last = 1;
3168			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3169				return r;
3170
3171			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
3172			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3173			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3174
3175			alu.dst.sel = tmp1;
3176			alu.dst.chan = 3;
3177			alu.dst.write = 1;
3178
3179			alu.src[0].sel = tmp0;
3180			alu.src[0].chan = 3;
3181			if (signed_op) {
3182				alu.src[1].sel = tmp2;
3183				alu.src[1].chan = 1;
3184			} else {
3185				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3186			}
3187
3188			alu.last = 1;
3189			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3190				return r;
3191
3192		} else { /* UDIV */
3193
3194			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
3195			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3196			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3197
3198			alu.dst.sel = tmp1;
3199			alu.dst.chan = 2;
3200			alu.dst.write = 1;
3201
3202			alu.src[0].sel = tmp0;
3203			alu.src[0].chan = 2;
3204			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3205
3206			alu.last = 1;
3207			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3208				return r;
3209
3210			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
3211			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3212			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3213
3214			alu.dst.sel = tmp1;
3215			alu.dst.chan = 3;
3216			alu.dst.write = 1;
3217
3218			alu.src[0].sel = tmp0;
3219			alu.src[0].chan = 2;
3220			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
3221
3222			alu.last = 1;
3223			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3224				return r;
3225
3226		}
3227
3228		/* 17. tmp1.x = tmp1.x & tmp1.y */
3229		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3230		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
3231
3232		alu.dst.sel = tmp1;
3233		alu.dst.chan = 0;
3234		alu.dst.write = 1;
3235
3236		alu.src[0].sel = tmp1;
3237		alu.src[0].chan = 0;
3238		alu.src[1].sel = tmp1;
3239		alu.src[1].chan = 1;
3240
3241		alu.last = 1;
3242		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3243			return r;
3244
3245		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
3246		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
3247		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3248		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3249		alu.is_op3 = 1;
3250
3251		alu.dst.sel = tmp0;
3252		alu.dst.chan = 2;
3253		alu.dst.write = 1;
3254
3255		alu.src[0].sel = tmp1;
3256		alu.src[0].chan = 0;
3257		alu.src[1].sel = tmp0;
3258		alu.src[1].chan = mod ? 3 : 2;
3259		alu.src[2].sel = tmp1;
3260		alu.src[2].chan = 2;
3261
3262		alu.last = 1;
3263		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3264			return r;
3265
3266		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
3267		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3268		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3269		alu.is_op3 = 1;
3270
3271		if (signed_op) {
3272			alu.dst.sel = tmp0;
3273			alu.dst.chan = 2;
3274			alu.dst.write = 1;
3275		} else {
3276			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3277		}
3278
3279		alu.src[0].sel = tmp1;
3280		alu.src[0].chan = 1;
3281		alu.src[1].sel = tmp1;
3282		alu.src[1].chan = 3;
3283		alu.src[2].sel = tmp0;
3284		alu.src[2].chan = 2;
3285
3286		alu.last = 1;
3287		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3288			return r;
3289
3290		if (signed_op) {
3291
3292			/* fix the sign of the result */
3293
3294			if (mod) {
3295
3296				/* tmp0.x = -tmp0.z */
3297				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3298				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3299
3300				alu.dst.sel = tmp0;
3301				alu.dst.chan = 0;
3302				alu.dst.write = 1;
3303
3304				alu.src[0].sel = V_SQ_ALU_SRC_0;
3305				alu.src[1].sel = tmp0;
3306				alu.src[1].chan = 2;
3307
3308				alu.last = 1;
3309				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3310					return r;
3311
3312				/* sign of the remainder is the same as the sign of src0 */
3313				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
3314				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3315				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3316				alu.is_op3 = 1;
3317
3318				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3319
3320				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3321				alu.src[1].sel = tmp0;
3322				alu.src[1].chan = 2;
3323				alu.src[2].sel = tmp0;
3324				alu.src[2].chan = 0;
3325
3326				alu.last = 1;
3327				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3328					return r;
3329
3330			} else {
3331
3332				/* tmp0.x = -tmp0.z */
3333				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3334				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3335
3336				alu.dst.sel = tmp0;
3337				alu.dst.chan = 0;
3338				alu.dst.write = 1;
3339
3340				alu.src[0].sel = V_SQ_ALU_SRC_0;
3341				alu.src[1].sel = tmp0;
3342				alu.src[1].chan = 2;
3343
3344				alu.last = 1;
3345				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3346					return r;
3347
3348				/* fix the quotient sign (same as the sign of src0*src1) */
3349				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
3350				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3351				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3352				alu.is_op3 = 1;
3353
3354				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3355
3356				alu.src[0].sel = tmp2;
3357				alu.src[0].chan = 2;
3358				alu.src[1].sel = tmp0;
3359				alu.src[1].chan = 2;
3360				alu.src[2].sel = tmp0;
3361				alu.src[2].chan = 0;
3362
3363				alu.last = 1;
3364				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3365					return r;
3366			}
3367		}
3368	}
3369	return 0;
3370}
3371
/* UDIV: shared divide/modulo emitter with mod = 0 and signed_op = 0. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3376
/* UMOD: shared divide/modulo emitter with mod = 1 and signed_op = 0. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3381
/* IDIV: shared divide/modulo emitter with mod = 0 and signed_op = 1. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3386
/* IMOD: shared divide/modulo emitter with mod = 1 and signed_op = 1. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3391
3392
3393static int tgsi_f2i(struct r600_shader_ctx *ctx)
3394{
3395	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3396	struct r600_bytecode_alu alu;
3397	int i, r;
3398	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3399	int last_inst = tgsi_last_instruction(write_mask);
3400
3401	for (i = 0; i < 4; i++) {
3402		if (!(write_mask & (1<<i)))
3403			continue;
3404
3405		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3406		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC);
3407
3408		alu.dst.sel = ctx->temp_reg;
3409		alu.dst.chan = i;
3410		alu.dst.write = 1;
3411
3412		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3413		if (i == last_inst)
3414			alu.last = 1;
3415		r = r600_bytecode_add_alu(ctx->bc, &alu);
3416		if (r)
3417			return r;
3418	}
3419
3420	for (i = 0; i < 4; i++) {
3421		if (!(write_mask & (1<<i)))
3422			continue;
3423
3424		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3425		alu.inst = ctx->inst_info->r600_opcode;
3426
3427		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3428
3429		alu.src[0].sel = ctx->temp_reg;
3430		alu.src[0].chan = i;
3431
3432		if (i == last_inst || alu.inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT)
3433			alu.last = 1;
3434		r = r600_bytecode_add_alu(ctx->bc, &alu);
3435		if (r)
3436			return r;
3437	}
3438
3439	return 0;
3440}
3441
3442static int tgsi_iabs(struct r600_shader_ctx *ctx)
3443{
3444	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3445	struct r600_bytecode_alu alu;
3446	int i, r;
3447	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3448	int last_inst = tgsi_last_instruction(write_mask);
3449
3450	/* tmp = -src */
3451	for (i = 0; i < 4; i++) {
3452		if (!(write_mask & (1<<i)))
3453			continue;
3454
3455		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3456		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3457
3458		alu.dst.sel = ctx->temp_reg;
3459		alu.dst.chan = i;
3460		alu.dst.write = 1;
3461
3462		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3463		alu.src[0].sel = V_SQ_ALU_SRC_0;
3464
3465		if (i == last_inst)
3466			alu.last = 1;
3467		r = r600_bytecode_add_alu(ctx->bc, &alu);
3468		if (r)
3469			return r;
3470	}
3471
3472	/* dst = (src >= 0 ? src : tmp) */
3473	for (i = 0; i < 4; i++) {
3474		if (!(write_mask & (1<<i)))
3475			continue;
3476
3477		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3478		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3479		alu.is_op3 = 1;
3480		alu.dst.write = 1;
3481
3482		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3483
3484		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3485		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3486		alu.src[2].sel = ctx->temp_reg;
3487		alu.src[2].chan = i;
3488
3489		if (i == last_inst)
3490			alu.last = 1;
3491		r = r600_bytecode_add_alu(ctx->bc, &alu);
3492		if (r)
3493			return r;
3494	}
3495	return 0;
3496}
3497
3498static int tgsi_issg(struct r600_shader_ctx *ctx)
3499{
3500	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3501	struct r600_bytecode_alu alu;
3502	int i, r;
3503	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3504	int last_inst = tgsi_last_instruction(write_mask);
3505
3506	/* tmp = (src >= 0 ? src : -1) */
3507	for (i = 0; i < 4; i++) {
3508		if (!(write_mask & (1<<i)))
3509			continue;
3510
3511		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3512		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3513		alu.is_op3 = 1;
3514
3515		alu.dst.sel = ctx->temp_reg;
3516		alu.dst.chan = i;
3517		alu.dst.write = 1;
3518
3519		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3520		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3521		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
3522
3523		if (i == last_inst)
3524			alu.last = 1;
3525		r = r600_bytecode_add_alu(ctx->bc, &alu);
3526		if (r)
3527			return r;
3528	}
3529
3530	/* dst = (tmp > 0 ? 1 : tmp) */
3531	for (i = 0; i < 4; i++) {
3532		if (!(write_mask & (1<<i)))
3533			continue;
3534
3535		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3536		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT_INT);
3537		alu.is_op3 = 1;
3538		alu.dst.write = 1;
3539
3540		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3541
3542		alu.src[0].sel = ctx->temp_reg;
3543		alu.src[0].chan = i;
3544
3545		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3546
3547		alu.src[2].sel = ctx->temp_reg;
3548		alu.src[2].chan = i;
3549
3550		if (i == last_inst)
3551			alu.last = 1;
3552		r = r600_bytecode_add_alu(ctx->bc, &alu);
3553		if (r)
3554			return r;
3555	}
3556	return 0;
3557}
3558
3559
3560
3561static int tgsi_ssg(struct r600_shader_ctx *ctx)
3562{
3563	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3564	struct r600_bytecode_alu alu;
3565	int i, r;
3566
3567	/* tmp = (src > 0 ? 1 : src) */
3568	for (i = 0; i < 4; i++) {
3569		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3570		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3571		alu.is_op3 = 1;
3572
3573		alu.dst.sel = ctx->temp_reg;
3574		alu.dst.chan = i;
3575
3576		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3577		alu.src[1].sel = V_SQ_ALU_SRC_1;
3578		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
3579
3580		if (i == 3)
3581			alu.last = 1;
3582		r = r600_bytecode_add_alu(ctx->bc, &alu);
3583		if (r)
3584			return r;
3585	}
3586
3587	/* dst = (-tmp > 0 ? -1 : tmp) */
3588	for (i = 0; i < 4; i++) {
3589		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3590		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3591		alu.is_op3 = 1;
3592		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3593
3594		alu.src[0].sel = ctx->temp_reg;
3595		alu.src[0].chan = i;
3596		alu.src[0].neg = 1;
3597
3598		alu.src[1].sel = V_SQ_ALU_SRC_1;
3599		alu.src[1].neg = 1;
3600
3601		alu.src[2].sel = ctx->temp_reg;
3602		alu.src[2].chan = i;
3603
3604		if (i == 3)
3605			alu.last = 1;
3606		r = r600_bytecode_add_alu(ctx->bc, &alu);
3607		if (r)
3608			return r;
3609	}
3610	return 0;
3611}
3612
3613static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
3614{
3615	struct r600_bytecode_alu alu;
3616	int i, r;
3617
3618	for (i = 0; i < 4; i++) {
3619		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3620		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
3621			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
3622			alu.dst.chan = i;
3623		} else {
3624			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3625			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3626			alu.src[0].sel = ctx->temp_reg;
3627			alu.src[0].chan = i;
3628		}
3629		if (i == 3) {
3630			alu.last = 1;
3631		}
3632		r = r600_bytecode_add_alu(ctx->bc, &alu);
3633		if (r)
3634			return r;
3635	}
3636	return 0;
3637}
3638
3639static int tgsi_op3(struct r600_shader_ctx *ctx)
3640{
3641	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3642	struct r600_bytecode_alu alu;
3643	int i, j, r;
3644	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3645
3646	for (i = 0; i < lasti + 1; i++) {
3647		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3648			continue;
3649
3650		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3651		alu.inst = ctx->inst_info->r600_opcode;
3652		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3653			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3654		}
3655
3656		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3657		alu.dst.chan = i;
3658		alu.dst.write = 1;
3659		alu.is_op3 = 1;
3660		if (i == lasti) {
3661			alu.last = 1;
3662		}
3663		r = r600_bytecode_add_alu(ctx->bc, &alu);
3664		if (r)
3665			return r;
3666	}
3667	return 0;
3668}
3669
3670static int tgsi_dp(struct r600_shader_ctx *ctx)
3671{
3672	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3673	struct r600_bytecode_alu alu;
3674	int i, j, r;
3675
3676	for (i = 0; i < 4; i++) {
3677		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3678		alu.inst = ctx->inst_info->r600_opcode;
3679		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3680			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3681		}
3682
3683		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3684		alu.dst.chan = i;
3685		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3686		/* handle some special cases */
3687		switch (ctx->inst_info->tgsi_opcode) {
3688		case TGSI_OPCODE_DP2:
3689			if (i > 1) {
3690				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3691				alu.src[0].chan = alu.src[1].chan = 0;
3692			}
3693			break;
3694		case TGSI_OPCODE_DP3:
3695			if (i > 2) {
3696				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3697				alu.src[0].chan = alu.src[1].chan = 0;
3698			}
3699			break;
3700		case TGSI_OPCODE_DPH:
3701			if (i == 3) {
3702				alu.src[0].sel = V_SQ_ALU_SRC_1;
3703				alu.src[0].chan = 0;
3704				alu.src[0].neg = 0;
3705			}
3706			break;
3707		default:
3708			break;
3709		}
3710		if (i == 3) {
3711			alu.last = 1;
3712		}
3713		r = r600_bytecode_add_alu(ctx->bc, &alu);
3714		if (r)
3715			return r;
3716	}
3717	return 0;
3718}
3719
3720static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
3721						    unsigned index)
3722{
3723	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3724	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
3725		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
3726		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
3727		ctx->src[index].neg || ctx->src[index].abs;
3728}
3729
3730static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
3731					unsigned index)
3732{
3733	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3734	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
3735}
3736
3737static int tgsi_tex(struct r600_shader_ctx *ctx)
3738{
3739	static float one_point_five = 1.5f;
3740	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3741	struct r600_bytecode_tex tex;
3742	struct r600_bytecode_alu alu;
3743	unsigned src_gpr;
3744	int r, i, j;
3745	int opcode;
3746	/* Texture fetch instructions can only use gprs as source.
3747	 * Also they cannot negate the source or take the absolute value */
3748	const boolean src_requires_loading = inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
3749                                             tgsi_tex_src_requires_loading(ctx, 0);
3750	boolean src_loaded = FALSE;
3751	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
3752	uint8_t offset_x = 0, offset_y = 0, offset_z = 0;
3753
3754	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
3755
3756	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
3757		/* get offset values */
3758		if (inst->Texture.NumOffsets) {
3759			assert(inst->Texture.NumOffsets == 1);
3760
3761			offset_x = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
3762			offset_y = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
3763			offset_z = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
3764		}
3765	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
3766		/* TGSI moves the sampler to src reg 3 for TXD */
3767		sampler_src_reg = 3;
3768
3769		for (i = 1; i < 3; i++) {
3770			/* set gradients h/v */
3771			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
3772			tex.inst = (i == 1) ? SQ_TEX_INST_SET_GRADIENTS_H :
3773				SQ_TEX_INST_SET_GRADIENTS_V;
3774			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
3775			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
3776
3777			if (tgsi_tex_src_requires_loading(ctx, i)) {
3778				tex.src_gpr = r600_get_temp(ctx);
3779				tex.src_sel_x = 0;
3780				tex.src_sel_y = 1;
3781				tex.src_sel_z = 2;
3782				tex.src_sel_w = 3;
3783
3784				for (j = 0; j < 4; j++) {
3785					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3786					alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3787                                        r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
3788                                        alu.dst.sel = tex.src_gpr;
3789                                        alu.dst.chan = j;
3790                                        if (j == 3)
3791                                                alu.last = 1;
3792                                        alu.dst.write = 1;
3793                                        r = r600_bytecode_add_alu(ctx->bc, &alu);
3794                                        if (r)
3795                                                return r;
3796				}
3797
3798			} else {
3799				tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i);
3800				tex.src_sel_x = ctx->src[i].swizzle[0];
3801				tex.src_sel_y = ctx->src[i].swizzle[1];
3802				tex.src_sel_z = ctx->src[i].swizzle[2];
3803				tex.src_sel_w = ctx->src[i].swizzle[3];
3804				tex.src_rel = ctx->src[i].rel;
3805			}
3806			tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */
3807			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
3808			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
3809				tex.coord_type_x = 1;
3810				tex.coord_type_y = 1;
3811				tex.coord_type_z = 1;
3812				tex.coord_type_w = 1;
3813			}
3814			r = r600_bytecode_add_tex(ctx->bc, &tex);
3815			if (r)
3816				return r;
3817		}
3818	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
3819		int out_chan;
3820		/* Add perspective divide */
3821		if (ctx->bc->chip_class == CAYMAN) {
3822			out_chan = 2;
3823			for (i = 0; i < 3; i++) {
3824				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3825				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3826				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3827
3828				alu.dst.sel = ctx->temp_reg;
3829				alu.dst.chan = i;
3830				if (i == 2)
3831					alu.last = 1;
3832				if (out_chan == i)
3833					alu.dst.write = 1;
3834				r = r600_bytecode_add_alu(ctx->bc, &alu);
3835				if (r)
3836					return r;
3837			}
3838
3839		} else {
3840			out_chan = 3;
3841			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3842			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3843			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3844
3845			alu.dst.sel = ctx->temp_reg;
3846			alu.dst.chan = out_chan;
3847			alu.last = 1;
3848			alu.dst.write = 1;
3849			r = r600_bytecode_add_alu(ctx->bc, &alu);
3850			if (r)
3851				return r;
3852		}
3853
3854		for (i = 0; i < 3; i++) {
3855			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3856			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
3857			alu.src[0].sel = ctx->temp_reg;
3858			alu.src[0].chan = out_chan;
3859			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3860			alu.dst.sel = ctx->temp_reg;
3861			alu.dst.chan = i;
3862			alu.dst.write = 1;
3863			r = r600_bytecode_add_alu(ctx->bc, &alu);
3864			if (r)
3865				return r;
3866		}
3867		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3868		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3869		alu.src[0].sel = V_SQ_ALU_SRC_1;
3870		alu.src[0].chan = 0;
3871		alu.dst.sel = ctx->temp_reg;
3872		alu.dst.chan = 3;
3873		alu.last = 1;
3874		alu.dst.write = 1;
3875		r = r600_bytecode_add_alu(ctx->bc, &alu);
3876		if (r)
3877			return r;
3878		src_loaded = TRUE;
3879		src_gpr = ctx->temp_reg;
3880	}
3881
3882	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
3883	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) &&
3884	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
3885	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
3886
3887		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
3888		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
3889
3890		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
3891		for (i = 0; i < 4; i++) {
3892			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3893			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE);
3894			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
3895			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
3896			alu.dst.sel = ctx->temp_reg;
3897			alu.dst.chan = i;
3898			if (i == 3)
3899				alu.last = 1;
3900			alu.dst.write = 1;
3901			r = r600_bytecode_add_alu(ctx->bc, &alu);
3902			if (r)
3903				return r;
3904		}
3905
3906		/* tmp1.z = RCP_e(|tmp1.z|) */
3907		if (ctx->bc->chip_class == CAYMAN) {
3908			for (i = 0; i < 3; i++) {
3909				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3910				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3911				alu.src[0].sel = ctx->temp_reg;
3912				alu.src[0].chan = 2;
3913				alu.src[0].abs = 1;
3914				alu.dst.sel = ctx->temp_reg;
3915				alu.dst.chan = i;
3916				if (i == 2)
3917					alu.dst.write = 1;
3918				if (i == 2)
3919					alu.last = 1;
3920				r = r600_bytecode_add_alu(ctx->bc, &alu);
3921				if (r)
3922					return r;
3923			}
3924		} else {
3925			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3926			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3927			alu.src[0].sel = ctx->temp_reg;
3928			alu.src[0].chan = 2;
3929			alu.src[0].abs = 1;
3930			alu.dst.sel = ctx->temp_reg;
3931			alu.dst.chan = 2;
3932			alu.dst.write = 1;
3933			alu.last = 1;
3934			r = r600_bytecode_add_alu(ctx->bc, &alu);
3935			if (r)
3936				return r;
3937		}
3938
3939		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
3940		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
3941		 * muladd has no writemask, have to use another temp
3942		 */
3943		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3944		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
3945		alu.is_op3 = 1;
3946
3947		alu.src[0].sel = ctx->temp_reg;
3948		alu.src[0].chan = 0;
3949		alu.src[1].sel = ctx->temp_reg;
3950		alu.src[1].chan = 2;
3951
3952		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
3953		alu.src[2].chan = 0;
3954		alu.src[2].value = *(uint32_t *)&one_point_five;
3955
3956		alu.dst.sel = ctx->temp_reg;
3957		alu.dst.chan = 0;
3958		alu.dst.write = 1;
3959
3960		r = r600_bytecode_add_alu(ctx->bc, &alu);
3961		if (r)
3962			return r;
3963
3964		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3965		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
3966		alu.is_op3 = 1;
3967
3968		alu.src[0].sel = ctx->temp_reg;
3969		alu.src[0].chan = 1;
3970		alu.src[1].sel = ctx->temp_reg;
3971		alu.src[1].chan = 2;
3972
3973		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
3974		alu.src[2].chan = 0;
3975		alu.src[2].value = *(uint32_t *)&one_point_five;
3976
3977		alu.dst.sel = ctx->temp_reg;
3978		alu.dst.chan = 1;
3979		alu.dst.write = 1;
3980
3981		alu.last = 1;
3982		r = r600_bytecode_add_alu(ctx->bc, &alu);
3983		if (r)
3984			return r;
3985		/* write initial W value into Z component */
3986		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
3987			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3988			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3989			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3990			alu.dst.sel = ctx->temp_reg;
3991			alu.dst.chan = 2;
3992			alu.dst.write = 1;
3993			alu.last = 1;
3994			r = r600_bytecode_add_alu(ctx->bc, &alu);
3995			if (r)
3996				return r;
3997		}
3998		src_loaded = TRUE;
3999		src_gpr = ctx->temp_reg;
4000	}
4001
4002	if (src_requires_loading && !src_loaded) {
4003		for (i = 0; i < 4; i++) {
4004			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4005			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4006			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4007			alu.dst.sel = ctx->temp_reg;
4008			alu.dst.chan = i;
4009			if (i == 3)
4010				alu.last = 1;
4011			alu.dst.write = 1;
4012			r = r600_bytecode_add_alu(ctx->bc, &alu);
4013			if (r)
4014				return r;
4015		}
4016		src_loaded = TRUE;
4017		src_gpr = ctx->temp_reg;
4018	}
4019
4020	opcode = ctx->inst_info->r600_opcode;
4021	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4022	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4023	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4024	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4025	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
4026	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
4027		switch (opcode) {
4028		case SQ_TEX_INST_SAMPLE:
4029			opcode = SQ_TEX_INST_SAMPLE_C;
4030			break;
4031		case SQ_TEX_INST_SAMPLE_L:
4032			opcode = SQ_TEX_INST_SAMPLE_C_L;
4033			break;
4034		case SQ_TEX_INST_SAMPLE_LB:
4035			opcode = SQ_TEX_INST_SAMPLE_C_LB;
4036			break;
4037		case SQ_TEX_INST_SAMPLE_G:
4038			opcode = SQ_TEX_INST_SAMPLE_C_G;
4039			break;
4040		}
4041	}
4042
4043	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4044	tex.inst = opcode;
4045
4046	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4047	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4048	tex.src_gpr = src_gpr;
4049	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4050	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
4051	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
4052	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
4053	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
4054
4055	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
4056		tex.src_sel_x = 4;
4057		tex.src_sel_y = 4;
4058		tex.src_sel_z = 4;
4059		tex.src_sel_w = 4;
4060	} else if (src_loaded) {
4061		tex.src_sel_x = 0;
4062		tex.src_sel_y = 1;
4063		tex.src_sel_z = 2;
4064		tex.src_sel_w = 3;
4065	} else {
4066		tex.src_sel_x = ctx->src[0].swizzle[0];
4067		tex.src_sel_y = ctx->src[0].swizzle[1];
4068		tex.src_sel_z = ctx->src[0].swizzle[2];
4069		tex.src_sel_w = ctx->src[0].swizzle[3];
4070		tex.src_rel = ctx->src[0].rel;
4071	}
4072
4073	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE) {
4074		tex.src_sel_x = 1;
4075		tex.src_sel_y = 0;
4076		tex.src_sel_z = 3;
4077		tex.src_sel_w = 1;
4078	}
4079	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
4080		tex.src_sel_x = 1;
4081		tex.src_sel_y = 0;
4082		tex.src_sel_z = 3;
4083		tex.src_sel_w = 2; /* route Z compare value into W */
4084	}
4085
4086	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
4087	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
4088		tex.coord_type_x = 1;
4089		tex.coord_type_y = 1;
4090	}
4091	tex.coord_type_z = 1;
4092	tex.coord_type_w = 1;
4093
4094	tex.offset_x = offset_x;
4095	tex.offset_y = offset_y;
4096	tex.offset_z = offset_z;
4097
4098	/* Put the depth for comparison in W.
4099	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
4100	 * Some instructions expect the depth in Z. */
4101	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4102	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4103	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4104	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
4105	    opcode != SQ_TEX_INST_SAMPLE_C_L &&
4106	    opcode != SQ_TEX_INST_SAMPLE_C_LB) {
4107		tex.src_sel_w = tex.src_sel_z;
4108	}
4109
4110	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
4111	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4112		if (opcode == SQ_TEX_INST_SAMPLE_C_L ||
4113		    opcode == SQ_TEX_INST_SAMPLE_C_LB) {
4114			/* the array index is read from Y */
4115			tex.coord_type_y = 0;
4116		} else {
4117			/* the array index is read from Z */
4118			tex.coord_type_z = 0;
4119			tex.src_sel_z = tex.src_sel_y;
4120		}
4121	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
4122		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
4123		/* the array index is read from Z */
4124		tex.coord_type_z = 0;
4125
4126	r = r600_bytecode_add_tex(ctx->bc, &tex);
4127	if (r)
4128		return r;
4129
4130	/* add shadow ambient support  - gallium doesn't do it yet */
4131	return 0;
4132}
4133
4134static int tgsi_lrp(struct r600_shader_ctx *ctx)
4135{
4136	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4137	struct r600_bytecode_alu alu;
4138	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4139	unsigned i;
4140	int r;
4141
4142	/* optimize if it's just an equal balance */
4143	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
4144		for (i = 0; i < lasti + 1; i++) {
4145			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4146				continue;
4147
4148			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4149			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4150			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4151			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4152			alu.omod = 3;
4153			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4154			alu.dst.chan = i;
4155			if (i == lasti) {
4156				alu.last = 1;
4157			}
4158			r = r600_bytecode_add_alu(ctx->bc, &alu);
4159			if (r)
4160				return r;
4161		}
4162		return 0;
4163	}
4164
4165	/* 1 - src0 */
4166	for (i = 0; i < lasti + 1; i++) {
4167		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4168			continue;
4169
4170		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4171		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4172		alu.src[0].sel = V_SQ_ALU_SRC_1;
4173		alu.src[0].chan = 0;
4174		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4175		r600_bytecode_src_toggle_neg(&alu.src[1]);
4176		alu.dst.sel = ctx->temp_reg;
4177		alu.dst.chan = i;
4178		if (i == lasti) {
4179			alu.last = 1;
4180		}
4181		alu.dst.write = 1;
4182		r = r600_bytecode_add_alu(ctx->bc, &alu);
4183		if (r)
4184			return r;
4185	}
4186
4187	/* (1 - src0) * src2 */
4188	for (i = 0; i < lasti + 1; i++) {
4189		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4190			continue;
4191
4192		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4193		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4194		alu.src[0].sel = ctx->temp_reg;
4195		alu.src[0].chan = i;
4196		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4197		alu.dst.sel = ctx->temp_reg;
4198		alu.dst.chan = i;
4199		if (i == lasti) {
4200			alu.last = 1;
4201		}
4202		alu.dst.write = 1;
4203		r = r600_bytecode_add_alu(ctx->bc, &alu);
4204		if (r)
4205			return r;
4206	}
4207
4208	/* src0 * src1 + (1 - src0) * src2 */
4209	for (i = 0; i < lasti + 1; i++) {
4210		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4211			continue;
4212
4213		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4214		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4215		alu.is_op3 = 1;
4216		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4217		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4218		alu.src[2].sel = ctx->temp_reg;
4219		alu.src[2].chan = i;
4220
4221		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4222		alu.dst.chan = i;
4223		if (i == lasti) {
4224			alu.last = 1;
4225		}
4226		r = r600_bytecode_add_alu(ctx->bc, &alu);
4227		if (r)
4228			return r;
4229	}
4230	return 0;
4231}
4232
4233static int tgsi_cmp(struct r600_shader_ctx *ctx)
4234{
4235	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4236	struct r600_bytecode_alu alu;
4237	int i, r;
4238	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4239
4240	for (i = 0; i < lasti + 1; i++) {
4241		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4242			continue;
4243
4244		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4245		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE);
4246		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4247		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4248		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
4249		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4250		alu.dst.chan = i;
4251		alu.dst.write = 1;
4252		alu.is_op3 = 1;
4253		if (i == lasti)
4254			alu.last = 1;
4255		r = r600_bytecode_add_alu(ctx->bc, &alu);
4256		if (r)
4257			return r;
4258	}
4259	return 0;
4260}
4261
4262static int tgsi_xpd(struct r600_shader_ctx *ctx)
4263{
4264	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4265	static const unsigned int src0_swizzle[] = {2, 0, 1};
4266	static const unsigned int src1_swizzle[] = {1, 2, 0};
4267	struct r600_bytecode_alu alu;
4268	uint32_t use_temp = 0;
4269	int i, r;
4270
4271	if (inst->Dst[0].Register.WriteMask != 0xf)
4272		use_temp = 1;
4273
4274	for (i = 0; i < 4; i++) {
4275		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4276		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4277		if (i < 3) {
4278			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
4279			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
4280		} else {
4281			alu.src[0].sel = V_SQ_ALU_SRC_0;
4282			alu.src[0].chan = i;
4283			alu.src[1].sel = V_SQ_ALU_SRC_0;
4284			alu.src[1].chan = i;
4285		}
4286
4287		alu.dst.sel = ctx->temp_reg;
4288		alu.dst.chan = i;
4289		alu.dst.write = 1;
4290
4291		if (i == 3)
4292			alu.last = 1;
4293		r = r600_bytecode_add_alu(ctx->bc, &alu);
4294		if (r)
4295			return r;
4296	}
4297
4298	for (i = 0; i < 4; i++) {
4299		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4300		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4301
4302		if (i < 3) {
4303			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
4304			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
4305		} else {
4306			alu.src[0].sel = V_SQ_ALU_SRC_0;
4307			alu.src[0].chan = i;
4308			alu.src[1].sel = V_SQ_ALU_SRC_0;
4309			alu.src[1].chan = i;
4310		}
4311
4312		alu.src[2].sel = ctx->temp_reg;
4313		alu.src[2].neg = 1;
4314		alu.src[2].chan = i;
4315
4316		if (use_temp)
4317			alu.dst.sel = ctx->temp_reg;
4318		else
4319			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4320		alu.dst.chan = i;
4321		alu.dst.write = 1;
4322		alu.is_op3 = 1;
4323		if (i == 3)
4324			alu.last = 1;
4325		r = r600_bytecode_add_alu(ctx->bc, &alu);
4326		if (r)
4327			return r;
4328	}
4329	if (use_temp)
4330		return tgsi_helper_copy(ctx, inst);
4331	return 0;
4332}
4333
4334static int tgsi_exp(struct r600_shader_ctx *ctx)
4335{
4336	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4337	struct r600_bytecode_alu alu;
4338	int r;
4339	int i;
4340
4341	/* result.x = 2^floor(src); */
4342	if (inst->Dst[0].Register.WriteMask & 1) {
4343		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4344
4345		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4346		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4347
4348		alu.dst.sel = ctx->temp_reg;
4349		alu.dst.chan = 0;
4350		alu.dst.write = 1;
4351		alu.last = 1;
4352		r = r600_bytecode_add_alu(ctx->bc, &alu);
4353		if (r)
4354			return r;
4355
4356		if (ctx->bc->chip_class == CAYMAN) {
4357			for (i = 0; i < 3; i++) {
4358				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4359				alu.src[0].sel = ctx->temp_reg;
4360				alu.src[0].chan = 0;
4361
4362				alu.dst.sel = ctx->temp_reg;
4363				alu.dst.chan = i;
4364				if (i == 0)
4365					alu.dst.write = 1;
4366				if (i == 2)
4367					alu.last = 1;
4368				r = r600_bytecode_add_alu(ctx->bc, &alu);
4369				if (r)
4370					return r;
4371			}
4372		} else {
4373			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4374			alu.src[0].sel = ctx->temp_reg;
4375			alu.src[0].chan = 0;
4376
4377			alu.dst.sel = ctx->temp_reg;
4378			alu.dst.chan = 0;
4379			alu.dst.write = 1;
4380			alu.last = 1;
4381			r = r600_bytecode_add_alu(ctx->bc, &alu);
4382			if (r)
4383				return r;
4384		}
4385	}
4386
4387	/* result.y = tmp - floor(tmp); */
4388	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4389		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4390
4391		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
4392		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4393
4394		alu.dst.sel = ctx->temp_reg;
4395#if 0
4396		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4397		if (r)
4398			return r;
4399#endif
4400		alu.dst.write = 1;
4401		alu.dst.chan = 1;
4402
4403		alu.last = 1;
4404
4405		r = r600_bytecode_add_alu(ctx->bc, &alu);
4406		if (r)
4407			return r;
4408	}
4409
4410	/* result.z = RoughApprox2ToX(tmp);*/
4411	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
4412		if (ctx->bc->chip_class == CAYMAN) {
4413			for (i = 0; i < 3; i++) {
4414				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4415				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4416				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4417
4418				alu.dst.sel = ctx->temp_reg;
4419				alu.dst.chan = i;
4420				if (i == 2) {
4421					alu.dst.write = 1;
4422					alu.last = 1;
4423				}
4424
4425				r = r600_bytecode_add_alu(ctx->bc, &alu);
4426				if (r)
4427					return r;
4428			}
4429		} else {
4430			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4431			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4432			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4433
4434			alu.dst.sel = ctx->temp_reg;
4435			alu.dst.write = 1;
4436			alu.dst.chan = 2;
4437
4438			alu.last = 1;
4439
4440			r = r600_bytecode_add_alu(ctx->bc, &alu);
4441			if (r)
4442				return r;
4443		}
4444	}
4445
4446	/* result.w = 1.0;*/
4447	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
4448		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4449
4450		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4451		alu.src[0].sel = V_SQ_ALU_SRC_1;
4452		alu.src[0].chan = 0;
4453
4454		alu.dst.sel = ctx->temp_reg;
4455		alu.dst.chan = 3;
4456		alu.dst.write = 1;
4457		alu.last = 1;
4458		r = r600_bytecode_add_alu(ctx->bc, &alu);
4459		if (r)
4460			return r;
4461	}
4462	return tgsi_helper_copy(ctx, inst);
4463}
4464
4465static int tgsi_log(struct r600_shader_ctx *ctx)
4466{
4467	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4468	struct r600_bytecode_alu alu;
4469	int r;
4470	int i;
4471
4472	/* result.x = floor(log2(|src|)); */
4473	if (inst->Dst[0].Register.WriteMask & 1) {
4474		if (ctx->bc->chip_class == CAYMAN) {
4475			for (i = 0; i < 3; i++) {
4476				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4477
4478				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4479				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4480				r600_bytecode_src_set_abs(&alu.src[0]);
4481
4482				alu.dst.sel = ctx->temp_reg;
4483				alu.dst.chan = i;
4484				if (i == 0)
4485					alu.dst.write = 1;
4486				if (i == 2)
4487					alu.last = 1;
4488				r = r600_bytecode_add_alu(ctx->bc, &alu);
4489				if (r)
4490					return r;
4491			}
4492
4493		} else {
4494			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4495
4496			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4497			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4498			r600_bytecode_src_set_abs(&alu.src[0]);
4499
4500			alu.dst.sel = ctx->temp_reg;
4501			alu.dst.chan = 0;
4502			alu.dst.write = 1;
4503			alu.last = 1;
4504			r = r600_bytecode_add_alu(ctx->bc, &alu);
4505			if (r)
4506				return r;
4507		}
4508
4509		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4510		alu.src[0].sel = ctx->temp_reg;
4511		alu.src[0].chan = 0;
4512
4513		alu.dst.sel = ctx->temp_reg;
4514		alu.dst.chan = 0;
4515		alu.dst.write = 1;
4516		alu.last = 1;
4517
4518		r = r600_bytecode_add_alu(ctx->bc, &alu);
4519		if (r)
4520			return r;
4521	}
4522
4523	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
4524	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4525
4526		if (ctx->bc->chip_class == CAYMAN) {
4527			for (i = 0; i < 3; i++) {
4528				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4529
4530				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4531				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4532				r600_bytecode_src_set_abs(&alu.src[0]);
4533
4534				alu.dst.sel = ctx->temp_reg;
4535				alu.dst.chan = i;
4536				if (i == 1)
4537					alu.dst.write = 1;
4538				if (i == 2)
4539					alu.last = 1;
4540
4541				r = r600_bytecode_add_alu(ctx->bc, &alu);
4542				if (r)
4543					return r;
4544			}
4545		} else {
4546			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4547
4548			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4549			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4550			r600_bytecode_src_set_abs(&alu.src[0]);
4551
4552			alu.dst.sel = ctx->temp_reg;
4553			alu.dst.chan = 1;
4554			alu.dst.write = 1;
4555			alu.last = 1;
4556
4557			r = r600_bytecode_add_alu(ctx->bc, &alu);
4558			if (r)
4559				return r;
4560		}
4561
4562		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4563
4564		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4565		alu.src[0].sel = ctx->temp_reg;
4566		alu.src[0].chan = 1;
4567
4568		alu.dst.sel = ctx->temp_reg;
4569		alu.dst.chan = 1;
4570		alu.dst.write = 1;
4571		alu.last = 1;
4572
4573		r = r600_bytecode_add_alu(ctx->bc, &alu);
4574		if (r)
4575			return r;
4576
4577		if (ctx->bc->chip_class == CAYMAN) {
4578			for (i = 0; i < 3; i++) {
4579				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4580				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4581				alu.src[0].sel = ctx->temp_reg;
4582				alu.src[0].chan = 1;
4583
4584				alu.dst.sel = ctx->temp_reg;
4585				alu.dst.chan = i;
4586				if (i == 1)
4587					alu.dst.write = 1;
4588				if (i == 2)
4589					alu.last = 1;
4590
4591				r = r600_bytecode_add_alu(ctx->bc, &alu);
4592				if (r)
4593					return r;
4594			}
4595		} else {
4596			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4597			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4598			alu.src[0].sel = ctx->temp_reg;
4599			alu.src[0].chan = 1;
4600
4601			alu.dst.sel = ctx->temp_reg;
4602			alu.dst.chan = 1;
4603			alu.dst.write = 1;
4604			alu.last = 1;
4605
4606			r = r600_bytecode_add_alu(ctx->bc, &alu);
4607			if (r)
4608				return r;
4609		}
4610
4611		if (ctx->bc->chip_class == CAYMAN) {
4612			for (i = 0; i < 3; i++) {
4613				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4614				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4615				alu.src[0].sel = ctx->temp_reg;
4616				alu.src[0].chan = 1;
4617
4618				alu.dst.sel = ctx->temp_reg;
4619				alu.dst.chan = i;
4620				if (i == 1)
4621					alu.dst.write = 1;
4622				if (i == 2)
4623					alu.last = 1;
4624
4625				r = r600_bytecode_add_alu(ctx->bc, &alu);
4626				if (r)
4627					return r;
4628			}
4629		} else {
4630			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4631			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4632			alu.src[0].sel = ctx->temp_reg;
4633			alu.src[0].chan = 1;
4634
4635			alu.dst.sel = ctx->temp_reg;
4636			alu.dst.chan = 1;
4637			alu.dst.write = 1;
4638			alu.last = 1;
4639
4640			r = r600_bytecode_add_alu(ctx->bc, &alu);
4641			if (r)
4642				return r;
4643		}
4644
4645		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4646
4647		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4648
4649		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4650		r600_bytecode_src_set_abs(&alu.src[0]);
4651
4652		alu.src[1].sel = ctx->temp_reg;
4653		alu.src[1].chan = 1;
4654
4655		alu.dst.sel = ctx->temp_reg;
4656		alu.dst.chan = 1;
4657		alu.dst.write = 1;
4658		alu.last = 1;
4659
4660		r = r600_bytecode_add_alu(ctx->bc, &alu);
4661		if (r)
4662			return r;
4663	}
4664
4665	/* result.z = log2(|src|);*/
4666	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
4667		if (ctx->bc->chip_class == CAYMAN) {
4668			for (i = 0; i < 3; i++) {
4669				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4670
4671				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4672				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4673				r600_bytecode_src_set_abs(&alu.src[0]);
4674
4675				alu.dst.sel = ctx->temp_reg;
4676				if (i == 2)
4677					alu.dst.write = 1;
4678				alu.dst.chan = i;
4679				if (i == 2)
4680					alu.last = 1;
4681
4682				r = r600_bytecode_add_alu(ctx->bc, &alu);
4683				if (r)
4684					return r;
4685			}
4686		} else {
4687			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4688
4689			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4690			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4691			r600_bytecode_src_set_abs(&alu.src[0]);
4692
4693			alu.dst.sel = ctx->temp_reg;
4694			alu.dst.write = 1;
4695			alu.dst.chan = 2;
4696			alu.last = 1;
4697
4698			r = r600_bytecode_add_alu(ctx->bc, &alu);
4699			if (r)
4700				return r;
4701		}
4702	}
4703
4704	/* result.w = 1.0; */
4705	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
4706		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4707
4708		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4709		alu.src[0].sel = V_SQ_ALU_SRC_1;
4710		alu.src[0].chan = 0;
4711
4712		alu.dst.sel = ctx->temp_reg;
4713		alu.dst.chan = 3;
4714		alu.dst.write = 1;
4715		alu.last = 1;
4716
4717		r = r600_bytecode_add_alu(ctx->bc, &alu);
4718		if (r)
4719			return r;
4720	}
4721
4722	return tgsi_helper_copy(ctx, inst);
4723}
4724
4725static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
4726{
4727	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4728	struct r600_bytecode_alu alu;
4729	int r;
4730
4731	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4732
4733	switch (inst->Instruction.Opcode) {
4734	case TGSI_OPCODE_ARL:
4735		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR;
4736		break;
4737	case TGSI_OPCODE_ARR:
4738		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4739		break;
4740	case TGSI_OPCODE_UARL:
4741		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4742		break;
4743	default:
4744		assert(0);
4745		return -1;
4746	}
4747
4748	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4749	alu.last = 1;
4750	alu.dst.sel = ctx->bc->ar_reg;
4751	alu.dst.write = 1;
4752	r = r600_bytecode_add_alu(ctx->bc, &alu);
4753	if (r)
4754		return r;
4755
4756	ctx->bc->ar_loaded = 0;
4757	return 0;
4758}
4759static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
4760{
4761	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4762	struct r600_bytecode_alu alu;
4763	int r;
4764
4765	switch (inst->Instruction.Opcode) {
4766	case TGSI_OPCODE_ARL:
4767		memset(&alu, 0, sizeof(alu));
4768		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR;
4769		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4770		alu.dst.sel = ctx->bc->ar_reg;
4771		alu.dst.write = 1;
4772		alu.last = 1;
4773
4774		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4775			return r;
4776
4777		memset(&alu, 0, sizeof(alu));
4778		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4779		alu.src[0].sel = ctx->bc->ar_reg;
4780		alu.dst.sel = ctx->bc->ar_reg;
4781		alu.dst.write = 1;
4782		alu.last = 1;
4783
4784		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4785			return r;
4786		break;
4787	case TGSI_OPCODE_ARR:
4788		memset(&alu, 0, sizeof(alu));
4789		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4790		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4791		alu.dst.sel = ctx->bc->ar_reg;
4792		alu.dst.write = 1;
4793		alu.last = 1;
4794
4795		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4796			return r;
4797		break;
4798	case TGSI_OPCODE_UARL:
4799		memset(&alu, 0, sizeof(alu));
4800		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4801		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4802		alu.dst.sel = ctx->bc->ar_reg;
4803		alu.dst.write = 1;
4804		alu.last = 1;
4805
4806		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4807			return r;
4808		break;
4809	default:
4810		assert(0);
4811		return -1;
4812	}
4813
4814	ctx->bc->ar_loaded = 0;
4815	return 0;
4816}
4817
4818static int tgsi_opdst(struct r600_shader_ctx *ctx)
4819{
4820	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4821	struct r600_bytecode_alu alu;
4822	int i, r = 0;
4823
4824	for (i = 0; i < 4; i++) {
4825		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4826
4827		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4828		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4829
4830		if (i == 0 || i == 3) {
4831			alu.src[0].sel = V_SQ_ALU_SRC_1;
4832		} else {
4833			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4834		}
4835
4836		if (i == 0 || i == 2) {
4837			alu.src[1].sel = V_SQ_ALU_SRC_1;
4838		} else {
4839			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4840		}
4841		if (i == 3)
4842			alu.last = 1;
4843		r = r600_bytecode_add_alu(ctx->bc, &alu);
4844		if (r)
4845			return r;
4846	}
4847	return 0;
4848}
4849
4850static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
4851{
4852	struct r600_bytecode_alu alu;
4853	int r;
4854
4855	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4856	alu.inst = opcode;
4857	alu.predicate = 1;
4858
4859	alu.dst.sel = ctx->temp_reg;
4860	alu.dst.write = 1;
4861	alu.dst.chan = 0;
4862
4863	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4864	alu.src[1].sel = V_SQ_ALU_SRC_0;
4865	alu.src[1].chan = 0;
4866
4867	alu.last = 1;
4868
4869	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
4870	if (r)
4871		return r;
4872	return 0;
4873}
4874
4875static int pops(struct r600_shader_ctx *ctx, int pops)
4876{
4877	unsigned force_pop = ctx->bc->force_add_cf;
4878
4879	if (!force_pop) {
4880		int alu_pop = 3;
4881		if (ctx->bc->cf_last) {
4882			if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU))
4883				alu_pop = 0;
4884			else if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER))
4885				alu_pop = 1;
4886		}
4887		alu_pop += pops;
4888		if (alu_pop == 1) {
4889			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER);
4890			ctx->bc->force_add_cf = 1;
4891		} else if (alu_pop == 2) {
4892			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER);
4893			ctx->bc->force_add_cf = 1;
4894		} else {
4895			force_pop = 1;
4896		}
4897	}
4898
4899	if (force_pop) {
4900		r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_POP));
4901		ctx->bc->cf_last->pop_count = pops;
4902		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
4903	}
4904
4905	return 0;
4906}
4907
4908static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
4909{
4910	switch(reason) {
4911	case FC_PUSH_VPM:
4912		ctx->bc->callstack[ctx->bc->call_sp].current--;
4913		break;
4914	case FC_PUSH_WQM:
4915	case FC_LOOP:
4916		ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
4917		break;
4918	case FC_REP:
4919		/* TOODO : for 16 vp asic should -= 2; */
4920		ctx->bc->callstack[ctx->bc->call_sp].current --;
4921		break;
4922	}
4923}
4924
4925static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
4926{
4927	if (check_max_only) {
4928		int diff;
4929		switch (reason) {
4930		case FC_PUSH_VPM:
4931			diff = 1;
4932			break;
4933		case FC_PUSH_WQM:
4934			diff = 4;
4935			break;
4936		default:
4937			assert(0);
4938			diff = 0;
4939		}
4940		if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
4941		    ctx->bc->callstack[ctx->bc->call_sp].max) {
4942			ctx->bc->callstack[ctx->bc->call_sp].max =
4943				ctx->bc->callstack[ctx->bc->call_sp].current + diff;
4944		}
4945		return;
4946	}
4947	switch (reason) {
4948	case FC_PUSH_VPM:
4949		ctx->bc->callstack[ctx->bc->call_sp].current++;
4950		break;
4951	case FC_PUSH_WQM:
4952	case FC_LOOP:
4953		ctx->bc->callstack[ctx->bc->call_sp].current += 4;
4954		break;
4955	case FC_REP:
4956		ctx->bc->callstack[ctx->bc->call_sp].current++;
4957		break;
4958	}
4959
4960	if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
4961	    ctx->bc->callstack[ctx->bc->call_sp].max) {
4962		ctx->bc->callstack[ctx->bc->call_sp].max =
4963			ctx->bc->callstack[ctx->bc->call_sp].current;
4964	}
4965}
4966
4967static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
4968{
4969	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
4970
4971	sp->mid = (struct r600_bytecode_cf **)realloc((void *)sp->mid,
4972						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
4973	sp->mid[sp->num_mid] = ctx->bc->cf_last;
4974	sp->num_mid++;
4975}
4976
4977static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
4978{
4979	ctx->bc->fc_sp++;
4980	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
4981	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
4982}
4983
4984static void fc_poplevel(struct r600_shader_ctx *ctx)
4985{
4986	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
4987	if (sp->mid) {
4988		free(sp->mid);
4989		sp->mid = NULL;
4990	}
4991	sp->num_mid = 0;
4992	sp->start = NULL;
4993	sp->type = 0;
4994	ctx->bc->fc_sp--;
4995}
4996
#if 0
/*
 * Everything up to the matching #endif is compiled out.  It sketches
 * RETURN / flag-driven flow-control helpers; several are empty stubs.
 */

/* Add a CF RETURN instruction.  Always returns 0. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
	return 0;
}

/*
 * Add a CF JUMP with pop_count = pops.  'offset' is not used yet: the jump
 * target is never filled in.  Always returns 0.
 */
static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{
	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

/* Stub: emits nothing and returns 0. */
static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

/* Stub: emits nothing. */
static void emit_testflag(struct r600_shader_ctx *ctx)
{
}

/*
 * Sequence: test flag, jump, clear the flag with V_SQ_ALU_SRC_0,
 * pop ifidx + 1 levels, return.
 */
static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

/*
 * Sequence: test flag, add the CF opcode of the current instruction with
 * pop_count = 1, record it in the 'mid' list of level fc_sp, pop one level.
 */
static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	struct r600_bytecode *bc = ctx->bc;

	emit_testflag(ctx);

	r600_bytecode_add_cfinst(bc, ctx->inst_info->r600_opcode);
	bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
5044
5045static int tgsi_if(struct r600_shader_ctx *ctx)
5046{
5047	emit_logic_pred(ctx, CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
5048
5049	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
5050
5051	fc_pushlevel(ctx, FC_IF);
5052
5053	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
5054	return 0;
5055}
5056
5057static int tgsi_else(struct r600_shader_ctx *ctx)
5058{
5059	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_ELSE));
5060	ctx->bc->cf_last->pop_count = 1;
5061
5062	fc_set_mid(ctx, ctx->bc->fc_sp);
5063	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
5064	return 0;
5065}
5066
5067static int tgsi_endif(struct r600_shader_ctx *ctx)
5068{
5069	pops(ctx, 1);
5070	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
5071		R600_ERR("if/endif unbalanced in shader\n");
5072		return -1;
5073	}
5074
5075	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
5076		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5077		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
5078	} else {
5079		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
5080	}
5081	fc_poplevel(ctx);
5082
5083	callstack_decrease_current(ctx, FC_PUSH_VPM);
5084	return 0;
5085}
5086
5087static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
5088{
5089	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL));
5090
5091	fc_pushlevel(ctx, FC_LOOP);
5092
5093	/* check stack depth */
5094	callstack_check_depth(ctx, FC_LOOP, 0);
5095	return 0;
5096}
5097
5098static int tgsi_endloop(struct r600_shader_ctx *ctx)
5099{
5100	int i;
5101
5102	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END));
5103
5104	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
5105		R600_ERR("loop/endloop in shader code are not paired.\n");
5106		return -EINVAL;
5107	}
5108
5109	/* fixup loop pointers - from r600isa
5110	   LOOP END points to CF after LOOP START,
5111	   LOOP START point to CF after LOOP END
5112	   BRK/CONT point to LOOP END CF
5113	*/
5114	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
5115
5116	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5117
5118	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
5119		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
5120	}
5121	/* XXX add LOOPRET support */
5122	fc_poplevel(ctx);
5123	callstack_decrease_current(ctx, FC_LOOP);
5124	return 0;
5125}
5126
5127static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
5128{
5129	unsigned int fscp;
5130
5131	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
5132	{
5133		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
5134			break;
5135	}
5136
5137	if (fscp == 0) {
5138		R600_ERR("Break not inside loop/endloop pair\n");
5139		return -EINVAL;
5140	}
5141
5142	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
5143
5144	fc_set_mid(ctx, fscp);
5145
5146	callstack_check_depth(ctx, FC_PUSH_VPM, 1);
5147	return 0;
5148}
5149
5150static int tgsi_umad(struct r600_shader_ctx *ctx)
5151{
5152	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5153	struct r600_bytecode_alu alu;
5154	int i, j, r;
5155	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5156
5157	/* src0 * src1 */
5158	for (i = 0; i < lasti + 1; i++) {
5159		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5160			continue;
5161
5162		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5163
5164		alu.dst.chan = i;
5165		alu.dst.sel = ctx->temp_reg;
5166		alu.dst.write = 1;
5167
5168		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
5169		for (j = 0; j < 2; j++) {
5170		        r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5171		}
5172
5173		alu.last = 1;
5174		r = r600_bytecode_add_alu(ctx->bc, &alu);
5175		if (r)
5176			return r;
5177	}
5178
5179
5180	for (i = 0; i < lasti + 1; i++) {
5181		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5182			continue;
5183
5184		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5185		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5186
5187		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
5188
5189		alu.src[0].sel = ctx->temp_reg;
5190		alu.src[0].chan = i;
5191
5192		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5193		if (i == lasti) {
5194			alu.last = 1;
5195		}
5196		r = r600_bytecode_add_alu(ctx->bc, &alu);
5197		if (r)
5198			return r;
5199	}
5200	return 0;
5201}
5202
5203static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
5204	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5205	{TGSI_OPCODE_MOV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5206	{TGSI_OPCODE_LIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5207
5208	/* XXX:
5209	 * For state trackers other than OpenGL, we'll want to use
5210	 * _RECIP_IEEE instead.
5211	 */
5212	{TGSI_OPCODE_RCP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
5213
5214	{TGSI_OPCODE_RSQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_rsq},
5215	{TGSI_OPCODE_EXP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5216	{TGSI_OPCODE_LOG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5217	{TGSI_OPCODE_MUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5218	{TGSI_OPCODE_ADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5219	{TGSI_OPCODE_DP3,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5220	{TGSI_OPCODE_DP4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5221	{TGSI_OPCODE_DST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5222	{TGSI_OPCODE_MIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5223	{TGSI_OPCODE_MAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5224	{TGSI_OPCODE_SLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5225	{TGSI_OPCODE_SGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5226	{TGSI_OPCODE_MAD,	1, V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5227	{TGSI_OPCODE_SUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5228	{TGSI_OPCODE_LRP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5229	{TGSI_OPCODE_CND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5230	/* gap */
5231	{20,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5232	{TGSI_OPCODE_DP2A,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5233	/* gap */
5234	{22,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5235	{23,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5236	{TGSI_OPCODE_FRC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5237	{TGSI_OPCODE_CLAMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5238	{TGSI_OPCODE_FLR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5239	{TGSI_OPCODE_ROUND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5240	{TGSI_OPCODE_EX2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5241	{TGSI_OPCODE_LG2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5242	{TGSI_OPCODE_POW,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5243	{TGSI_OPCODE_XPD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5244	/* gap */
5245	{32,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5246	{TGSI_OPCODE_ABS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5247	{TGSI_OPCODE_RCC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5248	{TGSI_OPCODE_DPH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5249	{TGSI_OPCODE_COS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5250	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5251	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5252	{TGSI_OPCODE_KILP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5253	{TGSI_OPCODE_PK2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5254	{TGSI_OPCODE_PK2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5255	{TGSI_OPCODE_PK4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5256	{TGSI_OPCODE_PK4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5257	{TGSI_OPCODE_RFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5258	{TGSI_OPCODE_SEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5259	{TGSI_OPCODE_SFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5260	{TGSI_OPCODE_SGT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5261	{TGSI_OPCODE_SIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5262	{TGSI_OPCODE_SLE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5263	{TGSI_OPCODE_SNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5264	{TGSI_OPCODE_STR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5265	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5266	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5267	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5268	{TGSI_OPCODE_UP2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5269	{TGSI_OPCODE_UP2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5270	{TGSI_OPCODE_UP4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5271	{TGSI_OPCODE_UP4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5272	{TGSI_OPCODE_X2D,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5273	{TGSI_OPCODE_ARA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5274	{TGSI_OPCODE_ARR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5275	{TGSI_OPCODE_BRA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5276	{TGSI_OPCODE_CAL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5277	{TGSI_OPCODE_RET,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5278	{TGSI_OPCODE_SSG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5279	{TGSI_OPCODE_CMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5280	{TGSI_OPCODE_SCS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5281	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5282	{TGSI_OPCODE_NRM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5283	{TGSI_OPCODE_DIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5284	{TGSI_OPCODE_DP2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5285	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5286	{TGSI_OPCODE_BRK,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5287	{TGSI_OPCODE_IF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5288	/* gap */
5289	{75,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5290	{76,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5291	{TGSI_OPCODE_ELSE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5292	{TGSI_OPCODE_ENDIF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5293	/* gap */
5294	{79,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5295	{80,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5296	{TGSI_OPCODE_PUSHA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5297	{TGSI_OPCODE_POPA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5298	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5299	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5300	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5301	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5302	{TGSI_OPCODE_SHL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2_trans},
5303	/* gap */
5304	{88,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5305	{TGSI_OPCODE_AND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5306	{TGSI_OPCODE_OR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5307	{TGSI_OPCODE_MOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5308	{TGSI_OPCODE_XOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5309	{TGSI_OPCODE_SAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5310	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5311	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5312	{TGSI_OPCODE_CONT,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5313	{TGSI_OPCODE_EMIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5314	{TGSI_OPCODE_ENDPRIM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5315	{TGSI_OPCODE_BGNLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5316	{TGSI_OPCODE_BGNSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5317	{TGSI_OPCODE_ENDLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5318	{TGSI_OPCODE_ENDSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5319	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5320	/* gap */
5321	{104,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5322	{105,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5323	{106,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5324	{TGSI_OPCODE_NOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5325	/* gap */
5326	{108,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5327	{109,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5328	{110,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5329	{111,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5330	{TGSI_OPCODE_NRM4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5331	{TGSI_OPCODE_CALLNZ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5332	{TGSI_OPCODE_IFC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5333	{TGSI_OPCODE_BREAKC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5334	{TGSI_OPCODE_KIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5335	{TGSI_OPCODE_END,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5336	/* gap */
5337	{118,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5338	{TGSI_OPCODE_F2I,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2_trans},
5339	{TGSI_OPCODE_IDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5340	{TGSI_OPCODE_IMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5341	{TGSI_OPCODE_IMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5342	{TGSI_OPCODE_INEG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5343	{TGSI_OPCODE_ISGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5344	{TGSI_OPCODE_ISHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2_trans},
5345	{TGSI_OPCODE_ISLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5346	{TGSI_OPCODE_F2U,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
5347	{TGSI_OPCODE_U2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5348	{TGSI_OPCODE_UADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5349	{TGSI_OPCODE_UDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5350	{TGSI_OPCODE_UMAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5351	{TGSI_OPCODE_UMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5352	{TGSI_OPCODE_UMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5353	{TGSI_OPCODE_UMOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5354	{TGSI_OPCODE_UMUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5355	{TGSI_OPCODE_USEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5356	{TGSI_OPCODE_USGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5357	{TGSI_OPCODE_USHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2_trans},
5358	{TGSI_OPCODE_USLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5359	{TGSI_OPCODE_USNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2_swap},
5360	{TGSI_OPCODE_SWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5361	{TGSI_OPCODE_CASE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5362	{TGSI_OPCODE_DEFAULT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5363	{TGSI_OPCODE_ENDSWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5364	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5365	{TGSI_OPCODE_SAMPLE_I,  0, 0, tgsi_unsupported},
5366	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
5367	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5368	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5369	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5370	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5371	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5372	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5373	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5374	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5375	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5376	{TGSI_OPCODE_UARL,      0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_r600_arl},
5377	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5378	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5379	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5380	{TGSI_OPCODE_LAST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5381};
5382
/*
 * TGSI -> hardware opcode translation table built from the EG_-prefixed
 * ALU/CF opcode ids (eg_ presumably stands for Evergreen - confirm against
 * the code that selects between the tables).
 *
 * Each row is { TGSI opcode, flag, hardware opcode, handler }.  The flag is
 * 1 only on the MAD row, the single row carrying an OP3 opcode.  Rows whose
 * handler is tgsi_unsupported are presumably opcodes this table does not
 * implement.
 *
 * Rows are in ascending TGSI opcode order and unused opcode values get a
 * numbered placeholder row ("gap"), presumably so the table can be indexed
 * directly by opcode - keep that ordering when adding rows.
 */
5383static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
5384	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5385	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5386	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5387	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, tgsi_trans_srcx_replicate},
5388	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, tgsi_rsq},
5389	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5390	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5391	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5392	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5393	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5394	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5395	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5396	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5397	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5398	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5399	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5400	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5401	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5402	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5403	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5404	/* gap */
5405	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5406	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5407	/* gap */
5408	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5409	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5410	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5411	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5412	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5413	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5414	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5415	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5416	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5417	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5418	/* gap */
5419	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5420	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5421	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5422	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5423	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5424	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5425	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5426	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5427	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5428	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5429	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5430	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5431	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5432	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5433	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5434	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5435	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5436	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5437	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5438	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5439	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5440	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5441	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5442	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5443	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5444	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5445	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5446	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5447	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5448	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5449	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5450	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5451	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5452	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5453	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5454	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5455	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5456	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5457	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5458	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5459	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5460	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5461	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5462	/* gap */
5463	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5464	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5465	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5466	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5467	/* gap */
5468	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5469	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5470	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5471	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5472	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5473	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5474	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5475	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5476	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
5477	/* gap */
5478	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5479	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5480	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5481	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5482	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5483	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5484	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5485	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5486	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5487	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5488	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5489	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5490	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5491	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5492	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5493	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5494	/* gap */
5495	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5496	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5497	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5498	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5499	/* gap */
5500	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5501	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5502	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5503	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5504	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5505	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5506	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5507	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5508	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5509	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5510	/* gap */
5511	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5512	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_f2i},
5513	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5514	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5515	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5516	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5517	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5518	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
5519	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5520	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_f2i},
5521	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5522	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5523	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5524	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5525	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5526	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5527	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5528	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5529	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5530	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5531	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
5532	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5533	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
5534	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5535	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5536	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5537	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5538	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5539	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
5540	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
5541	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5542	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5543	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5544	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5545	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5546	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5547	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5548	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5549	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5550	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
5551	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5552	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5553	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5554	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5555};
5556
5557static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
5558	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5559	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5560	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5561	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, cayman_emit_float_instr},
5562	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, cayman_emit_float_instr},
5563	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5564	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5565	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5566	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5567	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5568	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5569	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5570	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5571	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5572	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5573	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5574	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5575	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5576	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5577	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5578	/* gap */
5579	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5580	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5581	/* gap */
5582	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5583	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5584	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5585	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5586	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5587	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5588	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, cayman_emit_float_instr},
5589	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, cayman_emit_float_instr},
5590	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, cayman_pow},
5591	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5592	/* gap */
5593	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5594	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5595	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5596	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5597	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, cayman_trig},
5598	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5599	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5600	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5601	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5602	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5603	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5604	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5605	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5606	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5607	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5608	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5609	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, cayman_trig},
5610	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5611	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5612	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5613	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5614	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5615	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5616	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5617	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5618	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5619	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5620	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5621	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5622	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5623	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5624	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5625	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5626	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5627	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5628	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5629	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5630	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5631	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5632	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5633	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5634	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5635	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5636	/* gap */
5637	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5638	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5639	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5640	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5641	/* gap */
5642	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5643	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5644	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5645	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5646	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5647	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2},
5648	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5649	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5650	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
5651	/* gap */
5652	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5653	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5654	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5655	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5656	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5657	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5658	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5659	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5660	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5661	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5662	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5663	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5664	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5665	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5666	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5667	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5668	/* gap */
5669	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5670	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5671	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5672	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5673	/* gap */
5674	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5675	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5676	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5677	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5678	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5679	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5680	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5681	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5682	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5683	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5684	/* gap */
5685	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5686	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2},
5687	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5688	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5689	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5690	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5691	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5692	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
5693	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5694	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
5695	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2},
5696	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5697	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5698	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5699	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5700	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5701	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5702	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT, cayman_mul_int_instr},
5703	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5704	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5705	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
5706	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5707	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
5708	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5709	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5710	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5711	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5712	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5713	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
5714	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
5715	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5716	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5717	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5718	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5719	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5720	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5721	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5722	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5723	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5724	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
5725	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5726	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5727	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5728	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5729};
5730