r600_shader.c revision d0a9ab29b2c8abf2900b1095883cba71b05b5cd9
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600d.h"
28
29#include "pipe/p_shader_tokens.h"
30#include "tgsi/tgsi_info.h"
31#include "tgsi/tgsi_parse.h"
32#include "tgsi/tgsi_scan.h"
33#include "tgsi/tgsi_dump.h"
34#include "util/u_memory.h"
35#include <stdio.h>
36#include <errno.h>
37#include <byteswap.h>
38
39/* CAYMAN notes
40Why CAYMAN got loops for lots of instructions is explained here.
41
42-These 8xx t-slot only ops are implemented in all vector slots.
43MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
44These 8xx t-slot only opcodes become vector ops, with all four
45slots expecting the arguments on sources a and b. Result is
46broadcast to all channels.
47MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
48These 8xx t-slot only opcodes become vector ops in the z, y, and
49x slots.
50EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
51RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
52SQRT_IEEE/_64
53SIN/COS
54The w slot may have an independent co-issued operation, or if the
55result is required to be in the w slot, the opcode above may be
56issued in the w slot as well.
57The compiler must issue the source argument to slots z, y, and x
58*/
59
60static int r600_pipe_shader(struct pipe_context *ctx, struct r600_pipe_shader *shader)
61{
62	struct r600_context *rctx = (struct r600_context *)ctx;
63	struct r600_shader *rshader = &shader->shader;
64	uint32_t *ptr;
65	int	i;
66
67	/* copy new shader */
68	if (shader->bo == NULL) {
69		shader->bo = (struct r600_resource*)
70			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, rshader->bc.ndw * 4);
71		if (shader->bo == NULL) {
72			return -ENOMEM;
73		}
74		ptr = (uint32_t*)rctx->ws->buffer_map(shader->bo->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
75		if (R600_BIG_ENDIAN) {
76			for (i = 0; i < rshader->bc.ndw; ++i) {
77				ptr[i] = bswap_32(rshader->bc.bytecode[i]);
78			}
79		} else {
80			memcpy(ptr, rshader->bc.bytecode, rshader->bc.ndw * sizeof(*ptr));
81		}
82		rctx->ws->buffer_unmap(shader->bo->cs_buf);
83	}
84	/* build state */
85	switch (rshader->processor_type) {
86	case TGSI_PROCESSOR_VERTEX:
87		if (rctx->chip_class >= EVERGREEN) {
88			evergreen_pipe_shader_vs(ctx, shader);
89		} else {
90			r600_pipe_shader_vs(ctx, shader);
91		}
92		break;
93	case TGSI_PROCESSOR_FRAGMENT:
94		if (rctx->chip_class >= EVERGREEN) {
95			evergreen_pipe_shader_ps(ctx, shader);
96		} else {
97			r600_pipe_shader_ps(ctx, shader);
98		}
99		break;
100	default:
101		return -EINVAL;
102	}
103	return 0;
104}
105
106static int r600_shader_from_tgsi(struct r600_context * rctx, struct r600_pipe_shader *pipeshader);
107
108int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *shader)
109{
110	static int dump_shaders = -1;
111	struct r600_context *rctx = (struct r600_context *)ctx;
112	struct r600_pipe_shader_selector *sel = shader->selector;
113	int r;
114
115	/* Would like some magic "get_bool_option_once" routine.
116	*/
117	if (dump_shaders == -1)
118		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
119
120	if (dump_shaders) {
121		fprintf(stderr, "--------------------------------------------------------------\n");
122		tgsi_dump(sel->tokens, 0);
123
124		if (sel->so.num_outputs) {
125			unsigned i;
126			fprintf(stderr, "STREAMOUT\n");
127			for (i = 0; i < sel->so.num_outputs; i++) {
128				unsigned mask = ((1 << sel->so.output[i].num_components) - 1) <<
129						sel->so.output[i].start_component;
130				fprintf(stderr, "  %i: MEM_STREAM0_BUF%i OUT[%i].%s%s%s%s\n", i,
131					sel->so.output[i].output_buffer, sel->so.output[i].register_index,
132				        mask & 1 ? "x" : "_",
133				        (mask >> 1) & 1 ? "y" : "_",
134				        (mask >> 2) & 1 ? "z" : "_",
135				        (mask >> 3) & 1 ? "w" : "_");
136			}
137		}
138	}
139	r = r600_shader_from_tgsi(rctx, shader);
140	if (r) {
141		R600_ERR("translation from TGSI failed !\n");
142		return r;
143	}
144	r = r600_bytecode_build(&shader->shader.bc);
145	if (r) {
146		R600_ERR("building bytecode failed !\n");
147		return r;
148	}
149	if (dump_shaders) {
150		r600_bytecode_dump(&shader->shader.bc);
151		fprintf(stderr, "______________________________________________________________\n");
152	}
153	return r600_pipe_shader(ctx, shader);
154}
155
156void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
157{
158	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
159	r600_bytecode_clear(&shader->shader.bc);
160}
161
162/*
163 * tgsi -> r600 shader
164 */
165struct r600_shader_tgsi_instruction;
166
/* One TGSI source operand translated by tgsi_src(). */
struct r600_shader_src {
	unsigned	sel;		/* register index plus file offset, or a special ALU source such as V_SQ_ALU_SRC_LITERAL */
	unsigned	swizzle[4];	/* per-channel swizzle (X, Y, Z, W) */
	unsigned	neg;		/* negate modifier */
	unsigned	abs;		/* absolute-value modifier */
	unsigned	rel;		/* V_SQ_REL_RELATIVE for indirectly addressed registers */
	uint32_t	value[4];	/* the four literal dwords when sel is V_SQ_ALU_SRC_LITERAL */
};
175
176struct r600_shader_ctx {
177	struct tgsi_shader_info			info;
178	struct tgsi_parse_context		parse;
179	const struct tgsi_token			*tokens;
180	unsigned				type;
181	unsigned				file_offset[TGSI_FILE_COUNT];
182	unsigned				temp_reg;
183	struct r600_shader_tgsi_instruction	*inst_info;
184	struct r600_bytecode			*bc;
185	struct r600_shader			*shader;
186	struct r600_shader_src			src[4];
187	uint32_t				*literals;
188	uint32_t				nliterals;
189	uint32_t				max_driver_temp_used;
190	/* needed for evergreen interpolation */
191	boolean                                 input_centroid;
192	boolean                                 input_linear;
193	boolean                                 input_perspective;
194	int					num_interp_gpr;
195	int					face_gpr;
196	int					colors_used;
197	boolean                 clip_vertex_write;
198	unsigned                cv_output;
199	int					fragcoord_input;
200	int					native_integers;
201};
202
/*
 * One entry of a per-chip instruction table; the tables are indexed by
 * TGSI opcode (see r600_break_from_byte_stream()).
 */
struct r600_shader_tgsi_instruction {
	unsigned tgsi_opcode;
	unsigned is_op3;
	unsigned r600_opcode;
	/* handler invoked with the translation context */
	int (*process)(struct r600_shader_ctx *ctx);
};
209
210static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
211static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
212static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
213static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
214static int tgsi_else(struct r600_shader_ctx *ctx);
215static int tgsi_endif(struct r600_shader_ctx *ctx);
216static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
217static int tgsi_endloop(struct r600_shader_ctx *ctx);
218static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
219
220/*
221 * bytestream -> r600 shader
222 *
223 * These functions are used to transform the output of the LLVM backend into
224 * struct r600_bytecode.
225 */
226
227static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
228				unsigned char * bytes,	unsigned num_bytes);
229
230#ifdef HAVE_OPENCL
231int r600_compute_shader_create(struct pipe_context * ctx,
232	LLVMModuleRef mod,  struct r600_bytecode * bytecode)
233{
234	struct r600_context *r600_ctx = (struct r600_context *)ctx;
235	unsigned char * bytes;
236	unsigned byte_count;
237	struct r600_shader_ctx shader_ctx;
238	unsigned dump = 0;
239
240	if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
241		dump = 1;
242	}
243
244	r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
245	shader_ctx.bc = bytecode;
246	r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family);
247	shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
248	r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
249	if (shader_ctx.bc->chip_class == CAYMAN) {
250		cm_bytecode_add_cf_end(shader_ctx.bc);
251	}
252	r600_bytecode_build(shader_ctx.bc);
253	if (dump) {
254		r600_bytecode_dump(shader_ctx.bc);
255	}
256	return 1;
257}
258
259#endif /* HAVE_OPENCL */
260
/*
 * Read a 32-bit little-endian value from bytes, starting at *bytes_read.
 *
 * *bytes_read is advanced by four. The caller must guarantee that four
 * bytes are available at that position.
 */
static uint32_t i32_from_byte_stream(unsigned char * bytes,
		unsigned * bytes_read)
{
	unsigned i;
	uint32_t out = 0;
	for (i = 0; i < 4; i++) {
		/* widen before shifting: an unsigned char promotes to int and
		 * shifting a value >= 0x80 left by 24 would overflow it */
		out |= (uint32_t)bytes[(*bytes_read)++] << (8 * i);
	}
	return out;
}
271
272static unsigned r600_src_from_byte_stream(unsigned char * bytes,
273		unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
274{
275	unsigned i;
276	unsigned sel0, sel1;
277	sel0 = bytes[bytes_read++];
278	sel1 = bytes[bytes_read++];
279	alu->src[src_idx].sel = sel0 | (sel1 << 8);
280	alu->src[src_idx].chan = bytes[bytes_read++];
281	alu->src[src_idx].neg = bytes[bytes_read++];
282	alu->src[src_idx].abs = bytes[bytes_read++];
283	alu->src[src_idx].rel = bytes[bytes_read++];
284	alu->src[src_idx].kc_bank = bytes[bytes_read++];
285	for (i = 0; i < 4; i++) {
286		alu->src[src_idx].value |= bytes[bytes_read++] << (i * 8);
287	}
288	return bytes_read;
289}
290
291static unsigned r600_alu_from_byte_stream(struct r600_shader_ctx *ctx,
292				unsigned char * bytes, unsigned bytes_read)
293{
294	unsigned src_idx;
295	unsigned inst0, inst1;
296	unsigned push_modifier;
297	struct r600_bytecode_alu alu;
298	memset(&alu, 0, sizeof(alu));
299	for(src_idx = 0; src_idx < 3; src_idx++) {
300		bytes_read = r600_src_from_byte_stream(bytes, bytes_read,
301								&alu, src_idx);
302	}
303
304	alu.dst.sel = bytes[bytes_read++];
305	alu.dst.chan = bytes[bytes_read++];
306	alu.dst.clamp = bytes[bytes_read++];
307	alu.dst.write = bytes[bytes_read++];
308	alu.dst.rel = bytes[bytes_read++];
309	inst0 = bytes[bytes_read++];
310	inst1 = bytes[bytes_read++];
311	alu.inst = inst0 | (inst1 << 8);
312	alu.last = bytes[bytes_read++];
313	alu.is_op3 = bytes[bytes_read++];
314	push_modifier = bytes[bytes_read++];
315	alu.pred_sel = bytes[bytes_read++];
316	alu.bank_swizzle = bytes[bytes_read++];
317	alu.bank_swizzle_force = bytes[bytes_read++];
318	alu.omod = bytes[bytes_read++];
319	alu.index_mode = bytes[bytes_read++];
320
321
322	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE) ||
323	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE) ||
324	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT) ||
325	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT)) {
326		alu.update_pred = 1;
327		alu.dst.write = 0;
328		alu.src[1].sel = V_SQ_ALU_SRC_0;
329		alu.src[1].chan = 0;
330		alu.last = 1;
331    }
332
333    if (push_modifier) {
334        alu.pred_sel = 0;
335		alu.execute_mask = 1;
336		r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
337	} else
338		r600_bytecode_add_alu(ctx->bc, &alu);
339
340
341	/* XXX: Handle other KILL instructions */
342	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT)) {
343		ctx->shader->uses_kill = 1;
344		/* XXX: This should be enforced in the LLVM backend. */
345		ctx->bc->force_add_cf = 1;
346	}
347	return bytes_read;
348}
349
350static void llvm_if(struct r600_shader_ctx *ctx, struct r600_bytecode_alu * alu,
351	unsigned pred_inst)
352{
353	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
354	fc_pushlevel(ctx, FC_IF);
355	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
356}
357
358static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx,
359			struct r600_bytecode_alu *alu, unsigned compare_opcode)
360{
361	unsigned opcode = TGSI_OPCODE_BRK;
362	if (ctx->bc->chip_class == CAYMAN)
363		ctx->inst_info = &cm_shader_tgsi_instruction[opcode];
364	else if (ctx->bc->chip_class >= EVERGREEN)
365		ctx->inst_info = &eg_shader_tgsi_instruction[opcode];
366	else
367		ctx->inst_info = &r600_shader_tgsi_instruction[opcode];
368	llvm_if(ctx, alu, compare_opcode);
369	tgsi_loop_brk_cont(ctx);
370	tgsi_endif(ctx);
371}
372
373static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
374				unsigned char * bytes, unsigned bytes_read)
375{
376	struct r600_bytecode_alu alu;
377	unsigned inst;
378	memset(&alu, 0, sizeof(alu));
379	bytes_read = r600_src_from_byte_stream(bytes, bytes_read, &alu, 0);
380	inst = bytes[bytes_read++];
381	switch (inst) {
382	case 0: /* FC_IF */
383		llvm_if(ctx, &alu,
384			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
385		break;
386	case 1: /* FC_IF_INT */
387		llvm_if(ctx, &alu,
388			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
389		break;
390	case 2: /* FC_ELSE */
391		tgsi_else(ctx);
392		break;
393	case 3: /* FC_ENDIF */
394		tgsi_endif(ctx);
395		break;
396	case 4: /* FC_BGNLOOP */
397		tgsi_bgnloop(ctx);
398		break;
399	case 5: /* FC_ENDLOOP */
400		tgsi_endloop(ctx);
401		break;
402	case 6: /* FC_BREAK */
403		r600_break_from_byte_stream(ctx, &alu,
404			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
405		break;
406	case 7: /* FC_BREAK_NZ_INT */
407		r600_break_from_byte_stream(ctx, &alu,
408			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
409		break;
410	case 8: /* FC_CONTINUE */
411		{
412			unsigned opcode = TGSI_OPCODE_CONT;
413			if (ctx->bc->chip_class == CAYMAN) {
414				ctx->inst_info =
415					&cm_shader_tgsi_instruction[opcode];
416			} else if (ctx->bc->chip_class >= EVERGREEN) {
417				ctx->inst_info =
418					&eg_shader_tgsi_instruction[opcode];
419			} else {
420				ctx->inst_info =
421					&r600_shader_tgsi_instruction[opcode];
422			}
423			tgsi_loop_brk_cont(ctx);
424		}
425		break;
426	case 9: /* FC_BREAK_Z_INT */
427		r600_break_from_byte_stream(ctx, &alu,
428			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
429		break;
430	case 10: /* FC_BREAK_NZ */
431		r600_break_from_byte_stream(ctx, &alu,
432			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
433		break;
434	}
435
436	return bytes_read;
437}
438
439static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
440				unsigned char * bytes, unsigned bytes_read)
441{
442	struct r600_bytecode_tex tex;
443
444	tex.inst = bytes[bytes_read++];
445	tex.resource_id = bytes[bytes_read++];
446	tex.src_gpr = bytes[bytes_read++];
447	tex.src_rel = bytes[bytes_read++];
448	tex.dst_gpr = bytes[bytes_read++];
449	tex.dst_rel = bytes[bytes_read++];
450	tex.dst_sel_x = bytes[bytes_read++];
451	tex.dst_sel_y = bytes[bytes_read++];
452	tex.dst_sel_z = bytes[bytes_read++];
453	tex.dst_sel_w = bytes[bytes_read++];
454	tex.lod_bias = bytes[bytes_read++];
455	tex.coord_type_x = bytes[bytes_read++];
456	tex.coord_type_y = bytes[bytes_read++];
457	tex.coord_type_z = bytes[bytes_read++];
458	tex.coord_type_w = bytes[bytes_read++];
459	tex.offset_x = bytes[bytes_read++];
460	tex.offset_y = bytes[bytes_read++];
461	tex.offset_z = bytes[bytes_read++];
462	tex.sampler_id = bytes[bytes_read++];
463	tex.src_sel_x = bytes[bytes_read++];
464	tex.src_sel_y = bytes[bytes_read++];
465	tex.src_sel_z = bytes[bytes_read++];
466	tex.src_sel_w = bytes[bytes_read++];
467
468	r600_bytecode_add_tex(ctx->bc, &tex);
469
470	return bytes_read;
471}
472
473static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
474	unsigned char * bytes, unsigned bytes_read)
475{
476	struct r600_bytecode_vtx vtx;
477
478	uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
479        uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
480	uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);
481
482	memset(&vtx, 0, sizeof(vtx));
483
484	/* WORD0 */
485	vtx.inst = G_SQ_VTX_WORD0_VTX_INST(word0);
486	vtx.fetch_type = G_SQ_VTX_WORD0_FETCH_TYPE(word0);
487	vtx.buffer_id = G_SQ_VTX_WORD0_BUFFER_ID(word0);
488	vtx.src_gpr = G_SQ_VTX_WORD0_SRC_GPR(word0);
489	vtx.src_sel_x = G_SQ_VTX_WORD0_SRC_SEL_X(word0);
490	vtx.mega_fetch_count = G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(word0);
491
492	/* WORD1 */
493	vtx.dst_gpr = G_SQ_VTX_WORD1_GPR_DST_GPR(word1);
494	vtx.dst_sel_x = G_SQ_VTX_WORD1_DST_SEL_X(word1);
495	vtx.dst_sel_y = G_SQ_VTX_WORD1_DST_SEL_Y(word1);
496	vtx.dst_sel_z = G_SQ_VTX_WORD1_DST_SEL_Z(word1);
497	vtx.dst_sel_w = G_SQ_VTX_WORD1_DST_SEL_W(word1);
498	vtx.use_const_fields = G_SQ_VTX_WORD1_USE_CONST_FIELDS(word1);
499	vtx.data_format = G_SQ_VTX_WORD1_DATA_FORMAT(word1);
500	vtx.num_format_all = G_SQ_VTX_WORD1_NUM_FORMAT_ALL(word1);
501	vtx.format_comp_all = G_SQ_VTX_WORD1_FORMAT_COMP_ALL(word1);
502	vtx.srf_mode_all = G_SQ_VTX_WORD1_SRF_MODE_ALL(word1);
503
504	/* WORD 2*/
505	vtx.offset = G_SQ_VTX_WORD2_OFFSET(word2);
506	vtx.endian = G_SQ_VTX_WORD2_ENDIAN_SWAP(word2);
507
508	if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
509		fprintf(stderr, "Error adding vtx\n");
510	}
511	/* Use the Texture Cache */
512	ctx->bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX;
513	return bytes_read;
514}
515
516static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
517				unsigned char * bytes,	unsigned num_bytes)
518{
519	unsigned bytes_read = 0;
520	unsigned i, byte;
521	while (bytes_read < num_bytes) {
522		char inst_type = bytes[bytes_read++];
523		switch (inst_type) {
524		case 0:
525			bytes_read = r600_alu_from_byte_stream(ctx, bytes,
526								bytes_read);
527			break;
528		case 1:
529			bytes_read = r600_tex_from_byte_stream(ctx, bytes,
530								bytes_read);
531			break;
532		case 2:
533			bytes_read = r600_fc_from_byte_stream(ctx, bytes,
534								bytes_read);
535			break;
536		case 3:
537			r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
538			for (i = 0; i < 2; i++) {
539				for (byte = 0 ; byte < 4; byte++) {
540					ctx->bc->cf_last->isa[i] |=
541					(bytes[bytes_read++] << (byte * 8));
542				}
543			}
544			break;
545
546		case 4:
547			bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
548								bytes_read);
549			break;
550		default:
551			/* XXX: Error here */
552			break;
553		}
554	}
555}
556
557/* End bytestream -> r600 shader functions*/
558
559static int tgsi_is_supported(struct r600_shader_ctx *ctx)
560{
561	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
562	int j;
563
564	if (i->Instruction.NumDstRegs > 1) {
565		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
566		return -EINVAL;
567	}
568	if (i->Instruction.Predicate) {
569		R600_ERR("predicate unsupported\n");
570		return -EINVAL;
571	}
572#if 0
573	if (i->Instruction.Label) {
574		R600_ERR("label unsupported\n");
575		return -EINVAL;
576	}
577#endif
578	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
579		if (i->Src[j].Register.Dimension) {
580			R600_ERR("unsupported src %d (dimension %d)\n", j,
581				 i->Src[j].Register.Dimension);
582			return -EINVAL;
583		}
584	}
585	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
586		if (i->Dst[j].Register.Dimension) {
587			R600_ERR("unsupported dst (dimension)\n");
588			return -EINVAL;
589		}
590	}
591	return 0;
592}
593
594static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
595{
596	int i, r;
597	struct r600_bytecode_alu alu;
598	int gpr = 0, base_chan = 0;
599	int ij_index = 0;
600
601	if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
602		ij_index = 0;
603		if (ctx->shader->input[input].centroid)
604			ij_index++;
605	} else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
606		ij_index = 0;
607		/* if we have perspective add one */
608		if (ctx->input_perspective)  {
609			ij_index++;
610			/* if we have perspective centroid */
611			if (ctx->input_centroid)
612				ij_index++;
613		}
614		if (ctx->shader->input[input].centroid)
615			ij_index++;
616	}
617
618	/* work out gpr and base_chan from index */
619	gpr = ij_index / 2;
620	base_chan = (2 * (ij_index % 2)) + 1;
621
622	for (i = 0; i < 8; i++) {
623		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
624
625		if (i < 4)
626			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_ZW;
627		else
628			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_XY;
629
630		if ((i > 1) && (i < 6)) {
631			alu.dst.sel = ctx->shader->input[input].gpr;
632			alu.dst.write = 1;
633		}
634
635		alu.dst.chan = i % 4;
636
637		alu.src[0].sel = gpr;
638		alu.src[0].chan = (base_chan - (i % 2));
639
640		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
641
642		alu.bank_swizzle_force = SQ_ALU_VEC_210;
643		if ((i % 4) == 3)
644			alu.last = 1;
645		r = r600_bytecode_add_alu(ctx->bc, &alu);
646		if (r)
647			return r;
648	}
649	return 0;
650}
651
652static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
653{
654	int i, r;
655	struct r600_bytecode_alu alu;
656
657	for (i = 0; i < 4; i++) {
658		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
659
660		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P0;
661
662		alu.dst.sel = ctx->shader->input[input].gpr;
663		alu.dst.write = 1;
664
665		alu.dst.chan = i;
666
667		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
668		alu.src[0].chan = i;
669
670		if (i == 3)
671			alu.last = 1;
672		r = r600_bytecode_add_alu(ctx->bc, &alu);
673		if (r)
674			return r;
675	}
676	return 0;
677}
678
679/*
680 * Special export handling in shaders
681 *
682 * shader export ARRAY_BASE for EXPORT_POS:
683 * 60 is position
684 * 61 is misc vector
685 * 62, 63 are clip distance vectors
686 *
687 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
688 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
689 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
690 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
691 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
692 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
693 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
694 * exclusive from render target index)
695 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
696 *
697 *
698 * shader export ARRAY_BASE for EXPORT_PIXEL:
699 * 0-7 CB targets
700 * 61 computed Z vector
701 *
702 * The use of the values exported in the computed Z vector are controlled
703 * by DB_SHADER_CONTROL:
704 * Z_EXPORT_ENABLE - Z as a float in RED
705 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
706 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
707 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
708 * DB_SOURCE_FORMAT - export control restrictions
709 *
710 */
711
712
713/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
714static int r600_spi_sid(struct r600_shader_io * io)
715{
716	int index, name = io->name;
717
718	/* These params are handled differently, they don't need
719	 * semantic indices, so we'll use 0 for them.
720	 */
721	if (name == TGSI_SEMANTIC_POSITION ||
722		name == TGSI_SEMANTIC_PSIZE ||
723		name == TGSI_SEMANTIC_FACE)
724		index = 0;
725	else {
726		if (name == TGSI_SEMANTIC_GENERIC) {
727			/* For generic params simply use sid from tgsi */
728			index = io->sid;
729		} else {
730			/* For non-generic params - pack name and sid into 8 bits */
731			index = 0x80 | (name<<3) | (io->sid);
732		}
733
734		/* Make sure that all really used indices have nonzero value, so
735		 * we can just compare it to 0 later instead of comparing the name
736		 * with different values to detect special cases. */
737		index++;
738	}
739
740	return index;
741};
742
743/* turn input into interpolate on EG */
744static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
745{
746	int r = 0;
747
748	if (ctx->shader->input[index].spi_sid) {
749		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
750		if (ctx->shader->input[index].interpolate > 0) {
751			r = evergreen_interp_alu(ctx, index);
752		} else {
753			r = evergreen_interp_flat(ctx, index);
754		}
755	}
756	return r;
757}
758
759static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
760{
761	struct r600_bytecode_alu alu;
762	int i, r;
763	int gpr_front = ctx->shader->input[front].gpr;
764	int gpr_back = ctx->shader->input[back].gpr;
765
766	for (i = 0; i < 4; i++) {
767		memset(&alu, 0, sizeof(alu));
768		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
769		alu.is_op3 = 1;
770		alu.dst.write = 1;
771		alu.dst.sel = gpr_front;
772		alu.src[0].sel = ctx->face_gpr;
773		alu.src[1].sel = gpr_front;
774		alu.src[2].sel = gpr_back;
775
776		alu.dst.chan = i;
777		alu.src[1].chan = i;
778		alu.src[2].chan = i;
779		alu.last = (i==3);
780
781		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
782			return r;
783	}
784
785	return 0;
786}
787
788static int tgsi_declaration(struct r600_shader_ctx *ctx)
789{
790	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
791	unsigned i;
792	int r;
793
794	switch (d->Declaration.File) {
795	case TGSI_FILE_INPUT:
796		i = ctx->shader->ninput++;
797		ctx->shader->input[i].name = d->Semantic.Name;
798		ctx->shader->input[i].sid = d->Semantic.Index;
799		ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
800		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
801		ctx->shader->input[i].centroid = d->Interp.Centroid;
802		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
803		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
804			switch (ctx->shader->input[i].name) {
805			case TGSI_SEMANTIC_FACE:
806				ctx->face_gpr = ctx->shader->input[i].gpr;
807				break;
808			case TGSI_SEMANTIC_COLOR:
809				ctx->colors_used++;
810				break;
811			case TGSI_SEMANTIC_POSITION:
812				ctx->fragcoord_input = i;
813				break;
814			}
815			if (ctx->bc->chip_class >= EVERGREEN) {
816				if ((r = evergreen_interp_input(ctx, i)))
817					return r;
818			}
819		}
820		break;
821	case TGSI_FILE_OUTPUT:
822		i = ctx->shader->noutput++;
823		ctx->shader->output[i].name = d->Semantic.Name;
824		ctx->shader->output[i].sid = d->Semantic.Index;
825		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
826		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
827		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
828		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
829		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
830			switch (d->Semantic.Name) {
831			case TGSI_SEMANTIC_CLIPDIST:
832				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
833				break;
834			case TGSI_SEMANTIC_PSIZE:
835				ctx->shader->vs_out_misc_write = 1;
836				ctx->shader->vs_out_point_size = 1;
837				break;
838			case TGSI_SEMANTIC_CLIPVERTEX:
839				ctx->clip_vertex_write = TRUE;
840				ctx->cv_output = i;
841				break;
842			}
843		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
844			switch (d->Semantic.Name) {
845			case TGSI_SEMANTIC_COLOR:
846				ctx->shader->nr_ps_max_color_exports++;
847				break;
848			}
849		}
850		break;
851	case TGSI_FILE_CONSTANT:
852	case TGSI_FILE_TEMPORARY:
853	case TGSI_FILE_SAMPLER:
854	case TGSI_FILE_ADDRESS:
855		break;
856
857	case TGSI_FILE_SYSTEM_VALUE:
858		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
859			if (!ctx->native_integers) {
860				struct r600_bytecode_alu alu;
861				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
862
863				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
864				alu.src[0].sel = 0;
865				alu.src[0].chan = 3;
866
867				alu.dst.sel = 0;
868				alu.dst.chan = 3;
869				alu.dst.write = 1;
870				alu.last = 1;
871
872				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
873					return r;
874			}
875			break;
876		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
877			break;
878	default:
879		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
880		return -EINVAL;
881	}
882	return 0;
883}
884
885static int r600_get_temp(struct r600_shader_ctx *ctx)
886{
887	return ctx->temp_reg + ctx->max_driver_temp_used++;
888}
889
890/*
891 * for evergreen we need to scan the shader to find the number of GPRs we need to
892 * reserve for interpolation.
893 *
894 * we need to know if we are going to emit
895 * any centroid inputs
896 * if perspective and linear are required
897*/
898static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
899{
900	int i;
901	int num_baryc;
902
903	ctx->input_linear = FALSE;
904	ctx->input_perspective = FALSE;
905	ctx->input_centroid = FALSE;
906	ctx->num_interp_gpr = 1;
907
908	/* any centroid inputs */
909	for (i = 0; i < ctx->info.num_inputs; i++) {
910		/* skip position/face */
911		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
912		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
913			continue;
914		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
915			ctx->input_linear = TRUE;
916		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
917			ctx->input_perspective = TRUE;
918		if (ctx->info.input_centroid[i])
919			ctx->input_centroid = TRUE;
920	}
921
922	num_baryc = 0;
923	/* ignoring sample for now */
924	if (ctx->input_perspective)
925		num_baryc++;
926	if (ctx->input_linear)
927		num_baryc++;
928	if (ctx->input_centroid)
929		num_baryc *= 2;
930
931	ctx->num_interp_gpr += (num_baryc + 1) >> 1;
932
933	/* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */
934	return ctx->num_interp_gpr;
935}
936
937static void tgsi_src(struct r600_shader_ctx *ctx,
938		     const struct tgsi_full_src_register *tgsi_src,
939		     struct r600_shader_src *r600_src)
940{
941	memset(r600_src, 0, sizeof(*r600_src));
942	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
943	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
944	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
945	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
946	r600_src->neg = tgsi_src->Register.Negate;
947	r600_src->abs = tgsi_src->Register.Absolute;
948
949	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
950		int index;
951		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
952			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
953			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
954
955			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
956			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
957			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
958				return;
959		}
960		index = tgsi_src->Register.Index;
961		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
962		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
963	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
964		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
965			r600_src->swizzle[0] = 3;
966			r600_src->swizzle[1] = 3;
967			r600_src->swizzle[2] = 3;
968			r600_src->swizzle[3] = 3;
969			r600_src->sel = 0;
970		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
971			r600_src->swizzle[0] = 0;
972			r600_src->swizzle[1] = 0;
973			r600_src->swizzle[2] = 0;
974			r600_src->swizzle[3] = 0;
975			r600_src->sel = 0;
976		}
977	} else {
978		if (tgsi_src->Register.Indirect)
979			r600_src->rel = V_SQ_REL_RELATIVE;
980		r600_src->sel = tgsi_src->Register.Index;
981		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
982	}
983}
984
985static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int offset, unsigned int dst_reg)
986{
987	struct r600_bytecode_vtx vtx;
988	unsigned int ar_reg;
989	int r;
990
991	if (offset) {
992		struct r600_bytecode_alu alu;
993
994		memset(&alu, 0, sizeof(alu));
995
996		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
997		alu.src[0].sel = ctx->bc->ar_reg;
998
999		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1000		alu.src[1].value = offset;
1001
1002		alu.dst.sel = dst_reg;
1003		alu.dst.write = 1;
1004		alu.last = 1;
1005
1006		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1007			return r;
1008
1009		ar_reg = dst_reg;
1010	} else {
1011		ar_reg = ctx->bc->ar_reg;
1012	}
1013
1014	memset(&vtx, 0, sizeof(vtx));
1015	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
1016	vtx.src_gpr = ar_reg;
1017	vtx.mega_fetch_count = 16;
1018	vtx.dst_gpr = dst_reg;
1019	vtx.dst_sel_x = 0;		/* SEL_X */
1020	vtx.dst_sel_y = 1;		/* SEL_Y */
1021	vtx.dst_sel_z = 2;		/* SEL_Z */
1022	vtx.dst_sel_w = 3;		/* SEL_W */
1023	vtx.data_format = FMT_32_32_32_32_FLOAT;
1024	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1025	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1026	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
1027	vtx.endian = r600_endian_swap(32);
1028
1029	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1030		return r;
1031
1032	return 0;
1033}
1034
1035static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1036{
1037	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1038	struct r600_bytecode_alu alu;
1039	int i, j, k, nconst, r;
1040
1041	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1042		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1043			nconst++;
1044		}
1045		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1046	}
1047	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1048		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1049			continue;
1050		}
1051
1052		if (ctx->src[i].rel) {
1053			int treg = r600_get_temp(ctx);
1054			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].sel - 512, treg)))
1055				return r;
1056
1057			ctx->src[i].sel = treg;
1058			ctx->src[i].rel = 0;
1059			j--;
1060		} else if (j > 0) {
1061			int treg = r600_get_temp(ctx);
1062			for (k = 0; k < 4; k++) {
1063				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1064				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1065				alu.src[0].sel = ctx->src[i].sel;
1066				alu.src[0].chan = k;
1067				alu.src[0].rel = ctx->src[i].rel;
1068				alu.dst.sel = treg;
1069				alu.dst.chan = k;
1070				alu.dst.write = 1;
1071				if (k == 3)
1072					alu.last = 1;
1073				r = r600_bytecode_add_alu(ctx->bc, &alu);
1074				if (r)
1075					return r;
1076			}
1077			ctx->src[i].sel = treg;
1078			ctx->src[i].rel =0;
1079			j--;
1080		}
1081	}
1082	return 0;
1083}
1084
1085/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1086static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1087{
1088	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1089	struct r600_bytecode_alu alu;
1090	int i, j, k, nliteral, r;
1091
1092	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1093		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1094			nliteral++;
1095		}
1096	}
1097	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1098		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1099			int treg = r600_get_temp(ctx);
1100			for (k = 0; k < 4; k++) {
1101				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1102				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1103				alu.src[0].sel = ctx->src[i].sel;
1104				alu.src[0].chan = k;
1105				alu.src[0].value = ctx->src[i].value[k];
1106				alu.dst.sel = treg;
1107				alu.dst.chan = k;
1108				alu.dst.write = 1;
1109				if (k == 3)
1110					alu.last = 1;
1111				r = r600_bytecode_add_alu(ctx->bc, &alu);
1112				if (r)
1113					return r;
1114			}
1115			ctx->src[i].sel = treg;
1116			j--;
1117		}
1118	}
1119	return 0;
1120}
1121
1122static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1123{
1124	int i, r, count = ctx->shader->ninput;
1125
1126	/* additional inputs will be allocated right after the existing inputs,
1127	 * we won't need them after the color selection, so we don't need to
1128	 * reserve these gprs for the rest of the shader code and to adjust
1129	 * output offsets etc. */
1130	int gpr = ctx->file_offset[TGSI_FILE_INPUT] +
1131			ctx->info.file_max[TGSI_FILE_INPUT] + 1;
1132
1133	if (ctx->face_gpr == -1) {
1134		i = ctx->shader->ninput++;
1135		ctx->shader->input[i].name = TGSI_SEMANTIC_FACE;
1136		ctx->shader->input[i].spi_sid = 0;
1137		ctx->shader->input[i].gpr = gpr++;
1138		ctx->face_gpr = ctx->shader->input[i].gpr;
1139	}
1140
1141	for (i = 0; i < count; i++) {
1142		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1143			int ni = ctx->shader->ninput++;
1144			memcpy(&ctx->shader->input[ni],&ctx->shader->input[i], sizeof(struct r600_shader_io));
1145			ctx->shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
1146			ctx->shader->input[ni].spi_sid = r600_spi_sid(&ctx->shader->input[ni]);
1147			ctx->shader->input[ni].gpr = gpr++;
1148
1149			if (ctx->bc->chip_class >= EVERGREEN) {
1150				r = evergreen_interp_input(ctx, ni);
1151				if (r)
1152					return r;
1153			}
1154
1155			r = select_twoside_color(ctx, i, ni);
1156			if (r)
1157				return r;
1158		}
1159	}
1160	return 0;
1161}
1162
1163static int r600_shader_from_tgsi(struct r600_context * rctx, struct r600_pipe_shader *pipeshader)
1164{
1165	struct r600_shader *shader = &pipeshader->shader;
1166	struct tgsi_token *tokens = pipeshader->selector->tokens;
1167	struct pipe_stream_output_info so = pipeshader->selector->so;
1168	struct tgsi_full_immediate *immediate;
1169	struct tgsi_full_property *property;
1170	struct r600_shader_ctx ctx;
1171	struct r600_bytecode_output output[32];
1172	unsigned output_done, noutput;
1173	unsigned opcode;
1174	int i, j, k, r = 0;
1175	int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0;
1176	/* Declarations used by llvm code */
1177	bool use_llvm = false;
1178	unsigned char * inst_bytes = NULL;
1179	unsigned inst_byte_count = 0;
1180
1181#ifdef R600_USE_LLVM
1182	use_llvm = debug_get_bool_option("R600_LLVM", TRUE);
1183#endif
1184	ctx.bc = &shader->bc;
1185	ctx.shader = shader;
1186	ctx.native_integers = true;
1187
1188	r600_bytecode_init(ctx.bc, rctx->chip_class, rctx->family);
1189	ctx.tokens = tokens;
1190	tgsi_scan_shader(tokens, &ctx.info);
1191	tgsi_parse_init(&ctx.parse, tokens);
1192	ctx.type = ctx.parse.FullHeader.Processor.Processor;
1193	shader->processor_type = ctx.type;
1194	ctx.bc->type = shader->processor_type;
1195
1196	ctx.face_gpr = -1;
1197	ctx.fragcoord_input = -1;
1198	ctx.colors_used = 0;
1199	ctx.clip_vertex_write = 0;
1200
1201	shader->nr_ps_color_exports = 0;
1202	shader->nr_ps_max_color_exports = 0;
1203
1204	shader->two_side = (ctx.type == TGSI_PROCESSOR_FRAGMENT) && rctx->two_side;
1205
1206	/* register allocations */
1207	/* Values [0,127] correspond to GPR[0..127].
1208	 * Values [128,159] correspond to constant buffer bank 0
1209	 * Values [160,191] correspond to constant buffer bank 1
1210	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1211	 * Values [256,287] correspond to constant buffer bank 2 (EG)
1212	 * Values [288,319] correspond to constant buffer bank 3 (EG)
1213	 * Other special values are shown in the list below.
1214	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1215	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1216	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1217	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1218	 * 248	SQ_ALU_SRC_0: special constant 0.0.
1219	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
1220	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
1221	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1222	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
1223	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
1224	 * 254	SQ_ALU_SRC_PV: previous vector result.
1225	 * 255	SQ_ALU_SRC_PS: previous scalar result.
1226	 */
1227	for (i = 0; i < TGSI_FILE_COUNT; i++) {
1228		ctx.file_offset[i] = 0;
1229	}
1230	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1231		ctx.file_offset[TGSI_FILE_INPUT] = 1;
1232		if (ctx.bc->chip_class >= EVERGREEN) {
1233			r600_bytecode_add_cfinst(ctx.bc, EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1234		} else {
1235			r600_bytecode_add_cfinst(ctx.bc, V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1236		}
1237	}
1238	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
1239		ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1240	}
1241
1242	/* LLVM backend setup */
1243#ifdef R600_USE_LLVM
1244	if (use_llvm && ctx.info.indirect_files) {
1245		fprintf(stderr, "Warning: R600 LLVM backend does not support "
1246				"indirect adressing.  Falling back to TGSI "
1247				"backend.\n");
1248		use_llvm = 0;
1249	}
1250	if (use_llvm) {
1251		struct radeon_llvm_context radeon_llvm_ctx;
1252		LLVMModuleRef mod;
1253		unsigned dump = 0;
1254		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
1255		radeon_llvm_ctx.reserved_reg_count = ctx.file_offset[TGSI_FILE_INPUT];
1256		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
1257		if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
1258			dump = 1;
1259		}
1260		if (r600_llvm_compile(mod, &inst_bytes, &inst_byte_count,
1261							rctx->family, dump)) {
1262			FREE(inst_bytes);
1263			radeon_llvm_dispose(&radeon_llvm_ctx);
1264			use_llvm = 0;
1265			fprintf(stderr, "R600 LLVM backend failed to compile "
1266				"shader.  Falling back to TGSI\n");
1267		} else {
1268			ctx.file_offset[TGSI_FILE_OUTPUT] =
1269					ctx.file_offset[TGSI_FILE_INPUT];
1270		}
1271		radeon_llvm_dispose(&radeon_llvm_ctx);
1272	}
1273#endif
1274	/* End of LLVM backend setup */
1275
1276	if (!use_llvm) {
1277		ctx.file_offset[TGSI_FILE_OUTPUT] =
1278			ctx.file_offset[TGSI_FILE_INPUT] +
1279			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1280	}
1281	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
1282						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
1283
1284	/* Outside the GPR range. This will be translated to one of the
1285	 * kcache banks later. */
1286	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
1287
1288	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
1289	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
1290			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
1291	ctx.temp_reg = ctx.bc->ar_reg + 1;
1292
1293	ctx.nliterals = 0;
1294	ctx.literals = NULL;
1295	shader->fs_write_all = FALSE;
1296	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1297		tgsi_parse_token(&ctx.parse);
1298		switch (ctx.parse.FullToken.Token.Type) {
1299		case TGSI_TOKEN_TYPE_IMMEDIATE:
1300			immediate = &ctx.parse.FullToken.FullImmediate;
1301			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
1302			if(ctx.literals == NULL) {
1303				r = -ENOMEM;
1304				goto out_err;
1305			}
1306			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
1307			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
1308			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
1309			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
1310			ctx.nliterals++;
1311			break;
1312		case TGSI_TOKEN_TYPE_DECLARATION:
1313			r = tgsi_declaration(&ctx);
1314			if (r)
1315				goto out_err;
1316			break;
1317		case TGSI_TOKEN_TYPE_INSTRUCTION:
1318			break;
1319		case TGSI_TOKEN_TYPE_PROPERTY:
1320			property = &ctx.parse.FullToken.FullProperty;
1321			switch (property->Property.PropertyName) {
1322			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
1323				if (property->u[0].Data == 1)
1324					shader->fs_write_all = TRUE;
1325				break;
1326			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
1327				if (property->u[0].Data == 1)
1328					shader->vs_prohibit_ucps = TRUE;
1329				break;
1330			}
1331			break;
1332		default:
1333			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
1334			r = -EINVAL;
1335			goto out_err;
1336		}
1337	}
1338
1339	if (shader->fs_write_all && rctx->chip_class >= EVERGREEN)
1340		shader->nr_ps_max_color_exports = 8;
1341
1342	if (ctx.fragcoord_input >= 0) {
1343		if (ctx.bc->chip_class == CAYMAN) {
1344			for (j = 0 ; j < 4; j++) {
1345				struct r600_bytecode_alu alu;
1346				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1347				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1348				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1349				alu.src[0].chan = 3;
1350
1351				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1352				alu.dst.chan = j;
1353				alu.dst.write = (j == 3);
1354				alu.last = 1;
1355				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1356					return r;
1357			}
1358		} else {
1359			struct r600_bytecode_alu alu;
1360			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1361			alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1362			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1363			alu.src[0].chan = 3;
1364
1365			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1366			alu.dst.chan = 3;
1367			alu.dst.write = 1;
1368			alu.last = 1;
1369			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1370				return r;
1371		}
1372	}
1373
1374	if (shader->two_side && ctx.colors_used) {
1375		if ((r = process_twoside_color_inputs(&ctx)))
1376			return r;
1377	}
1378
1379	tgsi_parse_init(&ctx.parse, tokens);
1380	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1381		tgsi_parse_token(&ctx.parse);
1382		switch (ctx.parse.FullToken.Token.Type) {
1383		case TGSI_TOKEN_TYPE_INSTRUCTION:
1384			if (use_llvm) {
1385				continue;
1386			}
1387			r = tgsi_is_supported(&ctx);
1388			if (r)
1389				goto out_err;
1390			ctx.max_driver_temp_used = 0;
1391			/* reserve first tmp for everyone */
1392			r600_get_temp(&ctx);
1393
1394			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
1395			if ((r = tgsi_split_constant(&ctx)))
1396				goto out_err;
1397			if ((r = tgsi_split_literal_constant(&ctx)))
1398				goto out_err;
1399			if (ctx.bc->chip_class == CAYMAN)
1400				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
1401			else if (ctx.bc->chip_class >= EVERGREEN)
1402				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
1403			else
1404				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
1405			r = ctx.inst_info->process(&ctx);
1406			if (r)
1407				goto out_err;
1408			break;
1409		default:
1410			break;
1411		}
1412	}
1413
1414	/* Get instructions if we are using the LLVM backend. */
1415	if (use_llvm) {
1416		r600_bytecode_from_byte_stream(&ctx, inst_bytes, inst_byte_count);
1417		FREE(inst_bytes);
1418	}
1419
1420	noutput = shader->noutput;
1421
1422	if (ctx.clip_vertex_write) {
1423		/* need to convert a clipvertex write into clipdistance writes and not export
1424		   the clip vertex anymore */
1425
1426		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
1427		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1428		shader->output[noutput].gpr = ctx.temp_reg;
1429		noutput++;
1430		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1431		shader->output[noutput].gpr = ctx.temp_reg+1;
1432		noutput++;
1433
1434		/* reset spi_sid for clipvertex output to avoid confusing spi */
1435		shader->output[ctx.cv_output].spi_sid = 0;
1436
1437		shader->clip_dist_write = 0xFF;
1438
1439		for (i = 0; i < 8; i++) {
1440			int oreg = i >> 2;
1441			int ochan = i & 3;
1442
1443			for (j = 0; j < 4; j++) {
1444				struct r600_bytecode_alu alu;
1445				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1446				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4);
1447				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
1448				alu.src[0].chan = j;
1449
1450				alu.src[1].sel = 512 + i;
1451				alu.src[1].kc_bank = 1;
1452				alu.src[1].chan = j;
1453
1454				alu.dst.sel = ctx.temp_reg + oreg;
1455				alu.dst.chan = j;
1456				alu.dst.write = (j == ochan);
1457				if (j == 3)
1458					alu.last = 1;
1459				r = r600_bytecode_add_alu(ctx.bc, &alu);
1460				if (r)
1461					return r;
1462			}
1463		}
1464	}
1465
1466	/* Add stream outputs. */
1467	if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
1468		for (i = 0; i < so.num_outputs; i++) {
1469			struct r600_bytecode_output output;
1470
1471			if (so.output[i].output_buffer >= 4) {
1472				R600_ERR("exceeded the max number of stream output buffers, got: %d\n",
1473					 so.output[i].output_buffer);
1474				r = -EINVAL;
1475				goto out_err;
1476			}
1477			if (so.output[i].dst_offset < so.output[i].start_component) {
1478			   R600_ERR("stream_output - dst_offset cannot be less than start_component\n");
1479			   r = -EINVAL;
1480			   goto out_err;
1481			}
1482
1483			memset(&output, 0, sizeof(struct r600_bytecode_output));
1484			output.gpr = shader->output[so.output[i].register_index].gpr;
1485			output.elem_size = 0;
1486			output.array_base = so.output[i].dst_offset - so.output[i].start_component;
1487			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1488			output.burst_count = 1;
1489			output.barrier = 1;
1490			/* array_size is an upper limit for the burst_count
1491			 * with MEM_STREAM instructions */
1492			output.array_size = 0xFFF;
1493			output.comp_mask = ((1 << so.output[i].num_components) - 1) << so.output[i].start_component;
1494			if (ctx.bc->chip_class >= EVERGREEN) {
1495				switch (so.output[i].output_buffer) {
1496				case 0:
1497					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0;
1498					break;
1499				case 1:
1500					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1;
1501					break;
1502				case 2:
1503					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2;
1504					break;
1505				case 3:
1506					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3;
1507					break;
1508				}
1509			} else {
1510				switch (so.output[i].output_buffer) {
1511				case 0:
1512					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0;
1513					break;
1514				case 1:
1515					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1;
1516					break;
1517				case 2:
1518					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2;
1519					break;
1520				case 3:
1521					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3;
1522					break;
1523				}
1524			}
1525			r = r600_bytecode_add_output(ctx.bc, &output);
1526			if (r)
1527				goto out_err;
1528		}
1529	}
1530
1531	/* export output */
1532	for (i = 0, j = 0; i < noutput; i++, j++) {
1533		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1534		output[j].gpr = shader->output[i].gpr;
1535		output[j].elem_size = 3;
1536		output[j].swizzle_x = 0;
1537		output[j].swizzle_y = 1;
1538		output[j].swizzle_z = 2;
1539		output[j].swizzle_w = 3;
1540		output[j].burst_count = 1;
1541		output[j].barrier = 1;
1542		output[j].type = -1;
1543		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1544		switch (ctx.type) {
1545		case TGSI_PROCESSOR_VERTEX:
1546			switch (shader->output[i].name) {
1547			case TGSI_SEMANTIC_POSITION:
1548				output[j].array_base = next_pos_base++;
1549				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1550				break;
1551
1552			case TGSI_SEMANTIC_PSIZE:
1553				output[j].array_base = next_pos_base++;
1554				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1555				break;
1556			case TGSI_SEMANTIC_CLIPVERTEX:
1557				j--;
1558				break;
1559			case TGSI_SEMANTIC_CLIPDIST:
1560				output[j].array_base = next_pos_base++;
1561				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1562				/* spi_sid is 0 for clipdistance outputs that were generated
1563				 * for clipvertex - we don't need to pass them to PS */
1564				if (shader->output[i].spi_sid) {
1565					j++;
1566					/* duplicate it as PARAM to pass to the pixel shader */
1567					memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
1568					output[j].array_base = next_param_base++;
1569					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1570				}
1571				break;
1572			case TGSI_SEMANTIC_FOG:
1573				output[j].swizzle_y = 4; /* 0 */
1574				output[j].swizzle_z = 4; /* 0 */
1575				output[j].swizzle_w = 5; /* 1 */
1576				break;
1577			}
1578			break;
1579		case TGSI_PROCESSOR_FRAGMENT:
1580			if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
1581				/* never export more colors than the number of CBs */
1582				if (next_pixel_base && next_pixel_base >= (rctx->nr_cbufs + rctx->dual_src_blend * 1)) {
1583					/* skip export */
1584					j--;
1585					continue;
1586				}
1587				output[j].swizzle_w = rctx->alpha_to_one && rctx->multisample_enable && !rctx->cb0_is_integer ? 5 : 3;
1588				output[j].array_base = next_pixel_base++;
1589				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1590				shader->nr_ps_color_exports++;
1591				if (shader->fs_write_all && (rctx->chip_class >= EVERGREEN)) {
1592					for (k = 1; k < rctx->nr_cbufs; k++) {
1593						j++;
1594						memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1595						output[j].gpr = shader->output[i].gpr;
1596						output[j].elem_size = 3;
1597						output[j].swizzle_x = 0;
1598						output[j].swizzle_y = 1;
1599						output[j].swizzle_z = 2;
1600						output[j].swizzle_w = rctx->alpha_to_one && rctx->multisample_enable && !rctx->cb0_is_integer ? 5 : 3;
1601						output[j].burst_count = 1;
1602						output[j].barrier = 1;
1603						output[j].array_base = next_pixel_base++;
1604						output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1605						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1606						shader->nr_ps_color_exports++;
1607					}
1608				}
1609			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
1610				output[j].array_base = 61;
1611				output[j].swizzle_x = 2;
1612				output[j].swizzle_y = 7;
1613				output[j].swizzle_z = output[j].swizzle_w = 7;
1614				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1615			} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
1616				output[j].array_base = 61;
1617				output[j].swizzle_x = 7;
1618				output[j].swizzle_y = 1;
1619				output[j].swizzle_z = output[j].swizzle_w = 7;
1620				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1621			} else {
1622				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
1623				r = -EINVAL;
1624				goto out_err;
1625			}
1626			break;
1627		default:
1628			R600_ERR("unsupported processor type %d\n", ctx.type);
1629			r = -EINVAL;
1630			goto out_err;
1631		}
1632
1633		if (output[j].type==-1) {
1634			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1635			output[j].array_base = next_param_base++;
1636		}
1637	}
1638
1639	/* add fake param output for vertex shader if no param is exported */
1640	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
1641			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1642			output[j].gpr = 0;
1643			output[j].elem_size = 3;
1644			output[j].swizzle_x = 7;
1645			output[j].swizzle_y = 7;
1646			output[j].swizzle_z = 7;
1647			output[j].swizzle_w = 7;
1648			output[j].burst_count = 1;
1649			output[j].barrier = 1;
1650			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1651			output[j].array_base = 0;
1652			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1653			j++;
1654	}
1655
1656	/* add fake pixel export */
1657	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && next_pixel_base == 0) {
1658		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1659		output[j].gpr = 0;
1660		output[j].elem_size = 3;
1661		output[j].swizzle_x = 7;
1662		output[j].swizzle_y = 7;
1663		output[j].swizzle_z = 7;
1664		output[j].swizzle_w = 7;
1665		output[j].burst_count = 1;
1666		output[j].barrier = 1;
1667		output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1668		output[j].array_base = 0;
1669		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1670		j++;
1671	}
1672
1673	noutput = j;
1674
1675	/* set export done on last export of each type */
1676	for (i = noutput - 1, output_done = 0; i >= 0; i--) {
1677		if (ctx.bc->chip_class < CAYMAN) {
1678			if (i == (noutput - 1)) {
1679				output[i].end_of_program = 1;
1680			}
1681		}
1682		if (!(output_done & (1 << output[i].type))) {
1683			output_done |= (1 << output[i].type);
1684			output[i].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE);
1685		}
1686	}
1687	/* add output to bytecode */
1688	for (i = 0; i < noutput; i++) {
1689		r = r600_bytecode_add_output(ctx.bc, &output[i]);
1690		if (r)
1691			goto out_err;
1692	}
1693	/* add program end */
1694	if (ctx.bc->chip_class == CAYMAN)
1695		cm_bytecode_add_cf_end(ctx.bc);
1696
1697	/* check GPR limit - we have 124 = 128 - 4
1698	 * (4 are reserved as alu clause temporary registers) */
1699	if (ctx.bc->ngpr > 124) {
1700		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
1701		r = -ENOMEM;
1702		goto out_err;
1703	}
1704
1705	free(ctx.literals);
1706	tgsi_parse_free(&ctx.parse);
1707	return 0;
1708out_err:
1709	free(ctx.literals);
1710	tgsi_parse_free(&ctx.parse);
1711	return r;
1712}
1713
1714static int tgsi_unsupported(struct r600_shader_ctx *ctx)
1715{
1716	R600_ERR("%s tgsi opcode unsupported\n",
1717		 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
1718	return -EINVAL;
1719}
1720
/**
 * Instruction handler that emits nothing.
 *
 * \param ctx  shader translation context (unused)
 * \return always 0
 */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	(void)ctx;

	return 0;
}
1725
1726static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
1727			const struct r600_shader_src *shader_src,
1728			unsigned chan)
1729{
1730	bc_src->sel = shader_src->sel;
1731	bc_src->chan = shader_src->swizzle[chan];
1732	bc_src->neg = shader_src->neg;
1733	bc_src->abs = shader_src->abs;
1734	bc_src->rel = shader_src->rel;
1735	bc_src->value = shader_src->value[bc_src->chan];
1736}
1737
1738static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
1739{
1740	bc_src->abs = 1;
1741	bc_src->neg = 0;
1742}
1743
1744static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
1745{
1746	bc_src->neg = !bc_src->neg;
1747}
1748
1749static void tgsi_dst(struct r600_shader_ctx *ctx,
1750		     const struct tgsi_full_dst_register *tgsi_dst,
1751		     unsigned swizzle,
1752		     struct r600_bytecode_alu_dst *r600_dst)
1753{
1754	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1755
1756	r600_dst->sel = tgsi_dst->Register.Index;
1757	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
1758	r600_dst->chan = swizzle;
1759	r600_dst->write = 1;
1760	if (tgsi_dst->Register.Indirect)
1761		r600_dst->rel = V_SQ_REL_RELATIVE;
1762	if (inst->Instruction.Saturate) {
1763		r600_dst->clamp = 1;
1764	}
1765}
1766
/**
 * Return the index of the highest channel enabled in a 4-bit write mask.
 *
 * Only bits 0..3 are examined.  An empty mask yields 0, the same value as
 * a mask that enables only channel 0.
 *
 * \param writemask  TGSI destination write mask
 * \return channel index in [0, 3]
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* Scan from the top; channel 0 is also the fallback result. */
	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1u << chan))
			return chan;
	}
	return 0;
}
1778
1779static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
1780{
1781	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1782	struct r600_bytecode_alu alu;
1783	int i, j, r;
1784	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1785
1786	for (i = 0; i < lasti + 1; i++) {
1787		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1788			continue;
1789
1790		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1791		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1792
1793		alu.inst = ctx->inst_info->r600_opcode;
1794		if (!swap) {
1795			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1796				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
1797			}
1798		} else {
1799			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
1800			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1801		}
1802		/* handle some special cases */
1803		switch (ctx->inst_info->tgsi_opcode) {
1804		case TGSI_OPCODE_SUB:
1805			r600_bytecode_src_toggle_neg(&alu.src[1]);
1806			break;
1807		case TGSI_OPCODE_ABS:
1808			r600_bytecode_src_set_abs(&alu.src[0]);
1809			break;
1810		default:
1811			break;
1812		}
1813		if (i == lasti || trans_only) {
1814			alu.last = 1;
1815		}
1816		r = r600_bytecode_add_alu(ctx->bc, &alu);
1817		if (r)
1818			return r;
1819	}
1820	return 0;
1821}
1822
/**
 * Component-wise operation: sources in order, one instruction group.
 *
 * \param ctx  shader translation context
 * \return result of tgsi_op2_s() with swap = 0 and trans_only = 0
 */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1827
/**
 * Component-wise operation with the first two sources exchanged.
 *
 * \param ctx  shader translation context
 * \return result of tgsi_op2_s() with swap = 1 and trans_only = 0
 */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	const int swap = 1;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1832
/**
 * Component-wise operation where every emitted instruction is marked as
 * the last of its group.
 *
 * \param ctx  shader translation context
 * \return result of tgsi_op2_s() with swap = 0 and trans_only = 1
 */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 1;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1837
1838static int tgsi_ineg(struct r600_shader_ctx *ctx)
1839{
1840	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1841	struct r600_bytecode_alu alu;
1842	int i, r;
1843	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1844
1845	for (i = 0; i < lasti + 1; i++) {
1846
1847		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1848			continue;
1849		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1850		alu.inst = ctx->inst_info->r600_opcode;
1851
1852		alu.src[0].sel = V_SQ_ALU_SRC_0;
1853
1854		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1855
1856		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1857
1858		if (i == lasti) {
1859			alu.last = 1;
1860		}
1861		r = r600_bytecode_add_alu(ctx->bc, &alu);
1862		if (r)
1863			return r;
1864	}
1865	return 0;
1866
1867}
1868
1869static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
1870{
1871	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1872	int i, j, r;
1873	struct r600_bytecode_alu alu;
1874	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1875
1876	for (i = 0 ; i < last_slot; i++) {
1877		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1878		alu.inst = ctx->inst_info->r600_opcode;
1879		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1880			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
1881
1882			/* RSQ should take the absolute value of src */
1883			if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_RSQ) {
1884				r600_bytecode_src_set_abs(&alu.src[j]);
1885			}
1886		}
1887		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1888		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
1889
1890		if (i == last_slot - 1)
1891			alu.last = 1;
1892		r = r600_bytecode_add_alu(ctx->bc, &alu);
1893		if (r)
1894			return r;
1895	}
1896	return 0;
1897}
1898
1899static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
1900{
1901	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1902	int i, j, k, r;
1903	struct r600_bytecode_alu alu;
1904	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1905	for (k = 0; k < last_slot; k++) {
1906		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
1907			continue;
1908
1909		for (i = 0 ; i < 4; i++) {
1910			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1911			alu.inst = ctx->inst_info->r600_opcode;
1912			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1913				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
1914			}
1915			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1916			alu.dst.write = (i == k);
1917			if (i == 3)
1918				alu.last = 1;
1919			r = r600_bytecode_add_alu(ctx->bc, &alu);
1920			if (r)
1921				return r;
1922		}
1923	}
1924	return 0;
1925}
1926
1927/*
1928 * r600 - trunc to -PI..PI range
1929 * r700 - normalize by dividing by 2PI
1930 * see fdo bug 27901
1931 */
1932static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
1933{
1934	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
1935	static float double_pi = 3.1415926535 * 2;
1936	static float neg_pi = -3.1415926535;
1937
1938	int r;
1939	struct r600_bytecode_alu alu;
1940
1941	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1942	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
1943	alu.is_op3 = 1;
1944
1945	alu.dst.chan = 0;
1946	alu.dst.sel = ctx->temp_reg;
1947	alu.dst.write = 1;
1948
1949	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
1950
1951	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1952	alu.src[1].chan = 0;
1953	alu.src[1].value = *(uint32_t *)&half_inv_pi;
1954	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
1955	alu.src[2].chan = 0;
1956	alu.last = 1;
1957	r = r600_bytecode_add_alu(ctx->bc, &alu);
1958	if (r)
1959		return r;
1960
1961	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1962	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
1963
1964	alu.dst.chan = 0;
1965	alu.dst.sel = ctx->temp_reg;
1966	alu.dst.write = 1;
1967
1968	alu.src[0].sel = ctx->temp_reg;
1969	alu.src[0].chan = 0;
1970	alu.last = 1;
1971	r = r600_bytecode_add_alu(ctx->bc, &alu);
1972	if (r)
1973		return r;
1974
1975	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1976	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
1977	alu.is_op3 = 1;
1978
1979	alu.dst.chan = 0;
1980	alu.dst.sel = ctx->temp_reg;
1981	alu.dst.write = 1;
1982
1983	alu.src[0].sel = ctx->temp_reg;
1984	alu.src[0].chan = 0;
1985
1986	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1987	alu.src[1].chan = 0;
1988	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
1989	alu.src[2].chan = 0;
1990
1991	if (ctx->bc->chip_class == R600) {
1992		alu.src[1].value = *(uint32_t *)&double_pi;
1993		alu.src[2].value = *(uint32_t *)&neg_pi;
1994	} else {
1995		alu.src[1].sel = V_SQ_ALU_SRC_1;
1996		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
1997		alu.src[2].neg = 1;
1998	}
1999
2000	alu.last = 1;
2001	r = r600_bytecode_add_alu(ctx->bc, &alu);
2002	if (r)
2003		return r;
2004	return 0;
2005}
2006
2007static int cayman_trig(struct r600_shader_ctx *ctx)
2008{
2009	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2010	struct r600_bytecode_alu alu;
2011	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2012	int i, r;
2013
2014	r = tgsi_setup_trig(ctx);
2015	if (r)
2016		return r;
2017
2018
2019	for (i = 0; i < last_slot; i++) {
2020		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2021		alu.inst = ctx->inst_info->r600_opcode;
2022		alu.dst.chan = i;
2023
2024		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2025		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2026
2027		alu.src[0].sel = ctx->temp_reg;
2028		alu.src[0].chan = 0;
2029		if (i == last_slot - 1)
2030			alu.last = 1;
2031		r = r600_bytecode_add_alu(ctx->bc, &alu);
2032		if (r)
2033			return r;
2034	}
2035	return 0;
2036}
2037
2038static int tgsi_trig(struct r600_shader_ctx *ctx)
2039{
2040	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2041	struct r600_bytecode_alu alu;
2042	int i, r;
2043	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2044
2045	r = tgsi_setup_trig(ctx);
2046	if (r)
2047		return r;
2048
2049	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2050	alu.inst = ctx->inst_info->r600_opcode;
2051	alu.dst.chan = 0;
2052	alu.dst.sel = ctx->temp_reg;
2053	alu.dst.write = 1;
2054
2055	alu.src[0].sel = ctx->temp_reg;
2056	alu.src[0].chan = 0;
2057	alu.last = 1;
2058	r = r600_bytecode_add_alu(ctx->bc, &alu);
2059	if (r)
2060		return r;
2061
2062	/* replicate result */
2063	for (i = 0; i < lasti + 1; i++) {
2064		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2065			continue;
2066
2067		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2068		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2069
2070		alu.src[0].sel = ctx->temp_reg;
2071		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2072		if (i == lasti)
2073			alu.last = 1;
2074		r = r600_bytecode_add_alu(ctx->bc, &alu);
2075		if (r)
2076			return r;
2077	}
2078	return 0;
2079}
2080
2081static int tgsi_scs(struct r600_shader_ctx *ctx)
2082{
2083	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2084	struct r600_bytecode_alu alu;
2085	int i, r;
2086
2087	/* We'll only need the trig stuff if we are going to write to the
2088	 * X or Y components of the destination vector.
2089	 */
2090	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
2091		r = tgsi_setup_trig(ctx);
2092		if (r)
2093			return r;
2094	}
2095
2096	/* dst.x = COS */
2097	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2098		if (ctx->bc->chip_class == CAYMAN) {
2099			for (i = 0 ; i < 3; i++) {
2100				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2101				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2102				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2103
2104				if (i == 0)
2105					alu.dst.write = 1;
2106				else
2107					alu.dst.write = 0;
2108				alu.src[0].sel = ctx->temp_reg;
2109				alu.src[0].chan = 0;
2110				if (i == 2)
2111					alu.last = 1;
2112				r = r600_bytecode_add_alu(ctx->bc, &alu);
2113				if (r)
2114					return r;
2115			}
2116		} else {
2117			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2118			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2119			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2120
2121			alu.src[0].sel = ctx->temp_reg;
2122			alu.src[0].chan = 0;
2123			alu.last = 1;
2124			r = r600_bytecode_add_alu(ctx->bc, &alu);
2125			if (r)
2126				return r;
2127		}
2128	}
2129
2130	/* dst.y = SIN */
2131	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2132		if (ctx->bc->chip_class == CAYMAN) {
2133			for (i = 0 ; i < 3; i++) {
2134				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2135				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2136				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2137				if (i == 1)
2138					alu.dst.write = 1;
2139				else
2140					alu.dst.write = 0;
2141				alu.src[0].sel = ctx->temp_reg;
2142				alu.src[0].chan = 0;
2143				if (i == 2)
2144					alu.last = 1;
2145				r = r600_bytecode_add_alu(ctx->bc, &alu);
2146				if (r)
2147					return r;
2148			}
2149		} else {
2150			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2151			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2152			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2153
2154			alu.src[0].sel = ctx->temp_reg;
2155			alu.src[0].chan = 0;
2156			alu.last = 1;
2157			r = r600_bytecode_add_alu(ctx->bc, &alu);
2158			if (r)
2159				return r;
2160		}
2161	}
2162
2163	/* dst.z = 0.0; */
2164	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2165		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2166
2167		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2168
2169		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2170
2171		alu.src[0].sel = V_SQ_ALU_SRC_0;
2172		alu.src[0].chan = 0;
2173
2174		alu.last = 1;
2175
2176		r = r600_bytecode_add_alu(ctx->bc, &alu);
2177		if (r)
2178			return r;
2179	}
2180
2181	/* dst.w = 1.0; */
2182	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2183		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2184
2185		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2186
2187		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2188
2189		alu.src[0].sel = V_SQ_ALU_SRC_1;
2190		alu.src[0].chan = 0;
2191
2192		alu.last = 1;
2193
2194		r = r600_bytecode_add_alu(ctx->bc, &alu);
2195		if (r)
2196			return r;
2197	}
2198
2199	return 0;
2200}
2201
2202static int tgsi_kill(struct r600_shader_ctx *ctx)
2203{
2204	struct r600_bytecode_alu alu;
2205	int i, r;
2206
2207	for (i = 0; i < 4; i++) {
2208		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2209		alu.inst = ctx->inst_info->r600_opcode;
2210
2211		alu.dst.chan = i;
2212
2213		alu.src[0].sel = V_SQ_ALU_SRC_0;
2214
2215		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) {
2216			alu.src[1].sel = V_SQ_ALU_SRC_1;
2217			alu.src[1].neg = 1;
2218		} else {
2219			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2220		}
2221		if (i == 3) {
2222			alu.last = 1;
2223		}
2224		r = r600_bytecode_add_alu(ctx->bc, &alu);
2225		if (r)
2226			return r;
2227	}
2228
2229	/* kill must be last in ALU */
2230	ctx->bc->force_add_cf = 1;
2231	ctx->shader->uses_kill = TRUE;
2232	return 0;
2233}
2234
2235static int tgsi_lit(struct r600_shader_ctx *ctx)
2236{
2237	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2238	struct r600_bytecode_alu alu;
2239	int r;
2240
2241	/* tmp.x = max(src.y, 0.0) */
2242	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2243	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2244	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
2245	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2246	alu.src[1].chan = 1;
2247
2248	alu.dst.sel = ctx->temp_reg;
2249	alu.dst.chan = 0;
2250	alu.dst.write = 1;
2251
2252	alu.last = 1;
2253	r = r600_bytecode_add_alu(ctx->bc, &alu);
2254	if (r)
2255		return r;
2256
2257	if (inst->Dst[0].Register.WriteMask & (1 << 2))
2258	{
2259		int chan;
2260		int sel;
2261		int i;
2262
2263		if (ctx->bc->chip_class == CAYMAN) {
2264			for (i = 0; i < 3; i++) {
2265				/* tmp.z = log(tmp.x) */
2266				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2267				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2268				alu.src[0].sel = ctx->temp_reg;
2269				alu.src[0].chan = 0;
2270				alu.dst.sel = ctx->temp_reg;
2271				alu.dst.chan = i;
2272				if (i == 2) {
2273					alu.dst.write = 1;
2274					alu.last = 1;
2275				} else
2276					alu.dst.write = 0;
2277
2278				r = r600_bytecode_add_alu(ctx->bc, &alu);
2279				if (r)
2280					return r;
2281			}
2282		} else {
2283			/* tmp.z = log(tmp.x) */
2284			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2285			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2286			alu.src[0].sel = ctx->temp_reg;
2287			alu.src[0].chan = 0;
2288			alu.dst.sel = ctx->temp_reg;
2289			alu.dst.chan = 2;
2290			alu.dst.write = 1;
2291			alu.last = 1;
2292			r = r600_bytecode_add_alu(ctx->bc, &alu);
2293			if (r)
2294				return r;
2295		}
2296
2297		chan = alu.dst.chan;
2298		sel = alu.dst.sel;
2299
2300		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
2301		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2302		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT);
2303		alu.src[0].sel  = sel;
2304		alu.src[0].chan = chan;
2305		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
2306		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
2307		alu.dst.sel = ctx->temp_reg;
2308		alu.dst.chan = 0;
2309		alu.dst.write = 1;
2310		alu.is_op3 = 1;
2311		alu.last = 1;
2312		r = r600_bytecode_add_alu(ctx->bc, &alu);
2313		if (r)
2314			return r;
2315
2316		if (ctx->bc->chip_class == CAYMAN) {
2317			for (i = 0; i < 3; i++) {
2318				/* dst.z = exp(tmp.x) */
2319				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2320				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2321				alu.src[0].sel = ctx->temp_reg;
2322				alu.src[0].chan = 0;
2323				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2324				if (i == 2) {
2325					alu.dst.write = 1;
2326					alu.last = 1;
2327				} else
2328					alu.dst.write = 0;
2329				r = r600_bytecode_add_alu(ctx->bc, &alu);
2330				if (r)
2331					return r;
2332			}
2333		} else {
2334			/* dst.z = exp(tmp.x) */
2335			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2336			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2337			alu.src[0].sel = ctx->temp_reg;
2338			alu.src[0].chan = 0;
2339			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2340			alu.last = 1;
2341			r = r600_bytecode_add_alu(ctx->bc, &alu);
2342			if (r)
2343				return r;
2344		}
2345	}
2346
2347	/* dst.x, <- 1.0  */
2348	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2349	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2350	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
2351	alu.src[0].chan = 0;
2352	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2353	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
2354	r = r600_bytecode_add_alu(ctx->bc, &alu);
2355	if (r)
2356		return r;
2357
2358	/* dst.y = max(src.x, 0.0) */
2359	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2360	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2361	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2362	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2363	alu.src[1].chan = 0;
2364	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2365	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
2366	r = r600_bytecode_add_alu(ctx->bc, &alu);
2367	if (r)
2368		return r;
2369
2370	/* dst.w, <- 1.0  */
2371	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2372	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2373	alu.src[0].sel  = V_SQ_ALU_SRC_1;
2374	alu.src[0].chan = 0;
2375	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2376	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
2377	alu.last = 1;
2378	r = r600_bytecode_add_alu(ctx->bc, &alu);
2379	if (r)
2380		return r;
2381
2382	return 0;
2383}
2384
2385static int tgsi_rsq(struct r600_shader_ctx *ctx)
2386{
2387	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2388	struct r600_bytecode_alu alu;
2389	int i, r;
2390
2391	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2392
2393	/* XXX:
2394	 * For state trackers other than OpenGL, we'll want to use
2395	 * _RECIPSQRT_IEEE instead.
2396	 */
2397	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED);
2398
2399	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2400		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2401		r600_bytecode_src_set_abs(&alu.src[i]);
2402	}
2403	alu.dst.sel = ctx->temp_reg;
2404	alu.dst.write = 1;
2405	alu.last = 1;
2406	r = r600_bytecode_add_alu(ctx->bc, &alu);
2407	if (r)
2408		return r;
2409	/* replicate result */
2410	return tgsi_helper_tempx_replicate(ctx);
2411}
2412
2413static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
2414{
2415	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2416	struct r600_bytecode_alu alu;
2417	int i, r;
2418
2419	for (i = 0; i < 4; i++) {
2420		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2421		alu.src[0].sel = ctx->temp_reg;
2422		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2423		alu.dst.chan = i;
2424		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2425		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2426		if (i == 3)
2427			alu.last = 1;
2428		r = r600_bytecode_add_alu(ctx->bc, &alu);
2429		if (r)
2430			return r;
2431	}
2432	return 0;
2433}
2434
2435static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
2436{
2437	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2438	struct r600_bytecode_alu alu;
2439	int i, r;
2440
2441	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2442	alu.inst = ctx->inst_info->r600_opcode;
2443	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2444		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2445	}
2446	alu.dst.sel = ctx->temp_reg;
2447	alu.dst.write = 1;
2448	alu.last = 1;
2449	r = r600_bytecode_add_alu(ctx->bc, &alu);
2450	if (r)
2451		return r;
2452	/* replicate result */
2453	return tgsi_helper_tempx_replicate(ctx);
2454}
2455
2456static int cayman_pow(struct r600_shader_ctx *ctx)
2457{
2458	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2459	int i, r;
2460	struct r600_bytecode_alu alu;
2461	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2462
2463	for (i = 0; i < 3; i++) {
2464		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2465		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2466		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2467		alu.dst.sel = ctx->temp_reg;
2468		alu.dst.chan = i;
2469		alu.dst.write = 1;
2470		if (i == 2)
2471			alu.last = 1;
2472		r = r600_bytecode_add_alu(ctx->bc, &alu);
2473		if (r)
2474			return r;
2475	}
2476
2477	/* b * LOG2(a) */
2478	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2479	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2480	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2481	alu.src[1].sel = ctx->temp_reg;
2482	alu.dst.sel = ctx->temp_reg;
2483	alu.dst.write = 1;
2484	alu.last = 1;
2485	r = r600_bytecode_add_alu(ctx->bc, &alu);
2486	if (r)
2487		return r;
2488
2489	for (i = 0; i < last_slot; i++) {
2490		/* POW(a,b) = EXP2(b * LOG2(a))*/
2491		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2492		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2493		alu.src[0].sel = ctx->temp_reg;
2494
2495		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2496		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2497		if (i == last_slot - 1)
2498			alu.last = 1;
2499		r = r600_bytecode_add_alu(ctx->bc, &alu);
2500		if (r)
2501			return r;
2502	}
2503	return 0;
2504}
2505
2506static int tgsi_pow(struct r600_shader_ctx *ctx)
2507{
2508	struct r600_bytecode_alu alu;
2509	int r;
2510
2511	/* LOG2(a) */
2512	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2513	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2514	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2515	alu.dst.sel = ctx->temp_reg;
2516	alu.dst.write = 1;
2517	alu.last = 1;
2518	r = r600_bytecode_add_alu(ctx->bc, &alu);
2519	if (r)
2520		return r;
2521	/* b * LOG2(a) */
2522	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2523	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2524	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2525	alu.src[1].sel = ctx->temp_reg;
2526	alu.dst.sel = ctx->temp_reg;
2527	alu.dst.write = 1;
2528	alu.last = 1;
2529	r = r600_bytecode_add_alu(ctx->bc, &alu);
2530	if (r)
2531		return r;
2532	/* POW(a,b) = EXP2(b * LOG2(a))*/
2533	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2534	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2535	alu.src[0].sel = ctx->temp_reg;
2536	alu.dst.sel = ctx->temp_reg;
2537	alu.dst.write = 1;
2538	alu.last = 1;
2539	r = r600_bytecode_add_alu(ctx->bc, &alu);
2540	if (r)
2541		return r;
2542	return tgsi_helper_tempx_replicate(ctx);
2543}
2544
2545static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
2546{
2547	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2548	struct r600_bytecode_alu alu;
2549	int i, r, j;
2550	unsigned write_mask = inst->Dst[0].Register.WriteMask;
2551	int tmp0 = ctx->temp_reg;
2552	int tmp1 = r600_get_temp(ctx);
2553	int tmp2 = r600_get_temp(ctx);
2554	int tmp3 = r600_get_temp(ctx);
2555	/* Unsigned path:
2556	 *
2557	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
2558	 *
2559	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
2560	 * 2. tmp0.z = lo (tmp0.x * src2)
2561	 * 3. tmp0.w = -tmp0.z
2562	 * 4. tmp0.y = hi (tmp0.x * src2)
2563	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
2564	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
2565	 * 7. tmp1.x = tmp0.x - tmp0.w
2566	 * 8. tmp1.y = tmp0.x + tmp0.w
2567	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
2568	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
2569	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
2570	 *
2571	 * 12. tmp0.w = src1 - tmp0.y       = r
2572	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
2573	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
2574	 *
2575	 * if DIV
2576	 *
2577	 *   15. tmp1.z = tmp0.z + 1			= q + 1
2578	 *   16. tmp1.w = tmp0.z - 1			= q - 1
2579	 *
2580	 * else MOD
2581	 *
2582	 *   15. tmp1.z = tmp0.w - src2			= r - src2
2583	 *   16. tmp1.w = tmp0.w + src2			= r + src2
2584	 *
2585	 * endif
2586	 *
2587	 * 17. tmp1.x = tmp1.x & tmp1.y
2588	 *
2589	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
2590	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
2591	 *
2592	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
2593	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
2594	 *
2595	 * Signed path:
2596	 *
2597	 * Same as unsigned, using abs values of the operands,
2598	 * and fixing the sign of the result in the end.
2599	 */
2600
2601	for (i = 0; i < 4; i++) {
2602		if (!(write_mask & (1<<i)))
2603			continue;
2604
2605		if (signed_op) {
2606
2607			/* tmp2.x = -src0 */
2608			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2609			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2610
2611			alu.dst.sel = tmp2;
2612			alu.dst.chan = 0;
2613			alu.dst.write = 1;
2614
2615			alu.src[0].sel = V_SQ_ALU_SRC_0;
2616
2617			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2618
2619			alu.last = 1;
2620			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2621				return r;
2622
2623			/* tmp2.y = -src1 */
2624			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2625			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2626
2627			alu.dst.sel = tmp2;
2628			alu.dst.chan = 1;
2629			alu.dst.write = 1;
2630
2631			alu.src[0].sel = V_SQ_ALU_SRC_0;
2632
2633			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2634
2635			alu.last = 1;
2636			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2637				return r;
2638
2639			/* tmp2.z sign bit is set if src0 and src2 signs are different */
2640			/* it will be a sign of the quotient */
2641			if (!mod) {
2642
2643				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2644				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT);
2645
2646				alu.dst.sel = tmp2;
2647				alu.dst.chan = 2;
2648				alu.dst.write = 1;
2649
2650				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2651				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2652
2653				alu.last = 1;
2654				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2655					return r;
2656			}
2657
2658			/* tmp2.x = |src0| */
2659			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2660			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2661			alu.is_op3 = 1;
2662
2663			alu.dst.sel = tmp2;
2664			alu.dst.chan = 0;
2665			alu.dst.write = 1;
2666
2667			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2668			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2669			alu.src[2].sel = tmp2;
2670			alu.src[2].chan = 0;
2671
2672			alu.last = 1;
2673			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2674				return r;
2675
2676			/* tmp2.y = |src1| */
2677			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2678			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2679			alu.is_op3 = 1;
2680
2681			alu.dst.sel = tmp2;
2682			alu.dst.chan = 1;
2683			alu.dst.write = 1;
2684
2685			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2686			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2687			alu.src[2].sel = tmp2;
2688			alu.src[2].chan = 1;
2689
2690			alu.last = 1;
2691			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2692				return r;
2693
2694		}
2695
2696		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
2697		if (ctx->bc->chip_class == CAYMAN) {
2698			/* tmp3.x = u2f(src2) */
2699			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2700			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
2701
2702			alu.dst.sel = tmp3;
2703			alu.dst.chan = 0;
2704			alu.dst.write = 1;
2705
2706			if (signed_op) {
2707				alu.src[0].sel = tmp2;
2708				alu.src[0].chan = 1;
2709			} else {
2710				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2711			}
2712
2713			alu.last = 1;
2714			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2715				return r;
2716
2717			/* tmp0.x = recip(tmp3.x) */
2718			for (j = 0 ; j < 3; j++) {
2719				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2720				alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE;
2721
2722				alu.dst.sel = tmp0;
2723				alu.dst.chan = j;
2724				alu.dst.write = (j == 0);
2725
2726				alu.src[0].sel = tmp3;
2727				alu.src[0].chan = 0;
2728
2729				if (j == 2)
2730					alu.last = 1;
2731				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2732					return r;
2733			}
2734
2735			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2736			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2737
2738			alu.src[0].sel = tmp0;
2739			alu.src[0].chan = 0;
2740
2741			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2742			alu.src[1].value = 0x4f800000;
2743
2744			alu.dst.sel = tmp3;
2745			alu.dst.write = 1;
2746			alu.last = 1;
2747			r = r600_bytecode_add_alu(ctx->bc, &alu);
2748			if (r)
2749				return r;
2750
2751			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2752			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
2753
2754			alu.dst.sel = tmp0;
2755			alu.dst.chan = 0;
2756			alu.dst.write = 1;
2757
2758			alu.src[0].sel = tmp3;
2759			alu.src[0].chan = 0;
2760
2761			alu.last = 1;
2762			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2763				return r;
2764
2765		} else {
2766			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2767			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT);
2768
2769			alu.dst.sel = tmp0;
2770			alu.dst.chan = 0;
2771			alu.dst.write = 1;
2772
2773			if (signed_op) {
2774				alu.src[0].sel = tmp2;
2775				alu.src[0].chan = 1;
2776			} else {
2777				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2778			}
2779
2780			alu.last = 1;
2781			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2782				return r;
2783		}
2784
2785		/* 2. tmp0.z = lo (tmp0.x * src2) */
2786		if (ctx->bc->chip_class == CAYMAN) {
2787			for (j = 0 ; j < 4; j++) {
2788				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2789				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2790
2791				alu.dst.sel = tmp0;
2792				alu.dst.chan = j;
2793				alu.dst.write = (j == 2);
2794
2795				alu.src[0].sel = tmp0;
2796				alu.src[0].chan = 0;
2797				if (signed_op) {
2798					alu.src[1].sel = tmp2;
2799					alu.src[1].chan = 1;
2800				} else {
2801					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2802				}
2803
2804				alu.last = (j == 3);
2805				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2806					return r;
2807			}
2808		} else {
2809			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2810			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2811
2812			alu.dst.sel = tmp0;
2813			alu.dst.chan = 2;
2814			alu.dst.write = 1;
2815
2816			alu.src[0].sel = tmp0;
2817			alu.src[0].chan = 0;
2818			if (signed_op) {
2819				alu.src[1].sel = tmp2;
2820				alu.src[1].chan = 1;
2821			} else {
2822				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2823			}
2824
2825			alu.last = 1;
2826			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2827				return r;
2828		}
2829
2830		/* 3. tmp0.w = -tmp0.z */
2831		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2832		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2833
2834		alu.dst.sel = tmp0;
2835		alu.dst.chan = 3;
2836		alu.dst.write = 1;
2837
2838		alu.src[0].sel = V_SQ_ALU_SRC_0;
2839		alu.src[1].sel = tmp0;
2840		alu.src[1].chan = 2;
2841
2842		alu.last = 1;
2843		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2844			return r;
2845
2846		/* 4. tmp0.y = hi (tmp0.x * src2) */
2847		if (ctx->bc->chip_class == CAYMAN) {
2848			for (j = 0 ; j < 4; j++) {
2849				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2850				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2851
2852				alu.dst.sel = tmp0;
2853				alu.dst.chan = j;
2854				alu.dst.write = (j == 1);
2855
2856				alu.src[0].sel = tmp0;
2857				alu.src[0].chan = 0;
2858
2859				if (signed_op) {
2860					alu.src[1].sel = tmp2;
2861					alu.src[1].chan = 1;
2862				} else {
2863					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2864				}
2865				alu.last = (j == 3);
2866				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2867					return r;
2868			}
2869		} else {
2870			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2871			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2872
2873			alu.dst.sel = tmp0;
2874			alu.dst.chan = 1;
2875			alu.dst.write = 1;
2876
2877			alu.src[0].sel = tmp0;
2878			alu.src[0].chan = 0;
2879
2880			if (signed_op) {
2881				alu.src[1].sel = tmp2;
2882				alu.src[1].chan = 1;
2883			} else {
2884				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2885			}
2886
2887			alu.last = 1;
2888			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2889				return r;
2890		}
2891
2892		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
2893		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2894		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
2895		alu.is_op3 = 1;
2896
2897		alu.dst.sel = tmp0;
2898		alu.dst.chan = 2;
2899		alu.dst.write = 1;
2900
2901		alu.src[0].sel = tmp0;
2902		alu.src[0].chan = 1;
2903		alu.src[1].sel = tmp0;
2904		alu.src[1].chan = 3;
2905		alu.src[2].sel = tmp0;
2906		alu.src[2].chan = 2;
2907
2908		alu.last = 1;
2909		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2910			return r;
2911
2912		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
2913		if (ctx->bc->chip_class == CAYMAN) {
2914			for (j = 0 ; j < 4; j++) {
2915				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2916				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2917
2918				alu.dst.sel = tmp0;
2919				alu.dst.chan = j;
2920				alu.dst.write = (j == 3);
2921
2922				alu.src[0].sel = tmp0;
2923				alu.src[0].chan = 2;
2924
2925				alu.src[1].sel = tmp0;
2926				alu.src[1].chan = 0;
2927
2928				alu.last = (j == 3);
2929				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2930					return r;
2931			}
2932		} else {
2933			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2934			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2935
2936			alu.dst.sel = tmp0;
2937			alu.dst.chan = 3;
2938			alu.dst.write = 1;
2939
2940			alu.src[0].sel = tmp0;
2941			alu.src[0].chan = 2;
2942
2943			alu.src[1].sel = tmp0;
2944			alu.src[1].chan = 0;
2945
2946			alu.last = 1;
2947			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2948				return r;
2949		}
2950
2951		/* 7. tmp1.x = tmp0.x - tmp0.w */
2952		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2953		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2954
2955		alu.dst.sel = tmp1;
2956		alu.dst.chan = 0;
2957		alu.dst.write = 1;
2958
2959		alu.src[0].sel = tmp0;
2960		alu.src[0].chan = 0;
2961		alu.src[1].sel = tmp0;
2962		alu.src[1].chan = 3;
2963
2964		alu.last = 1;
2965		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2966			return r;
2967
2968		/* 8. tmp1.y = tmp0.x + tmp0.w */
2969		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2970		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
2971
2972		alu.dst.sel = tmp1;
2973		alu.dst.chan = 1;
2974		alu.dst.write = 1;
2975
2976		alu.src[0].sel = tmp0;
2977		alu.src[0].chan = 0;
2978		alu.src[1].sel = tmp0;
2979		alu.src[1].chan = 3;
2980
2981		alu.last = 1;
2982		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2983			return r;
2984
2985		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
2986		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2987		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
2988		alu.is_op3 = 1;
2989
2990		alu.dst.sel = tmp0;
2991		alu.dst.chan = 0;
2992		alu.dst.write = 1;
2993
2994		alu.src[0].sel = tmp0;
2995		alu.src[0].chan = 1;
2996		alu.src[1].sel = tmp1;
2997		alu.src[1].chan = 1;
2998		alu.src[2].sel = tmp1;
2999		alu.src[2].chan = 0;
3000
3001		alu.last = 1;
3002		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3003			return r;
3004
3005		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
3006		if (ctx->bc->chip_class == CAYMAN) {
3007			for (j = 0 ; j < 4; j++) {
3008				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3009				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3010
3011				alu.dst.sel = tmp0;
3012				alu.dst.chan = j;
3013				alu.dst.write = (j == 2);
3014
3015				alu.src[0].sel = tmp0;
3016				alu.src[0].chan = 0;
3017
3018				if (signed_op) {
3019					alu.src[1].sel = tmp2;
3020					alu.src[1].chan = 0;
3021				} else {
3022					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3023				}
3024
3025				alu.last = (j == 3);
3026				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3027					return r;
3028			}
3029		} else {
3030			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3031			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3032
3033			alu.dst.sel = tmp0;
3034			alu.dst.chan = 2;
3035			alu.dst.write = 1;
3036
3037			alu.src[0].sel = tmp0;
3038			alu.src[0].chan = 0;
3039
3040			if (signed_op) {
3041				alu.src[1].sel = tmp2;
3042				alu.src[1].chan = 0;
3043			} else {
3044				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3045			}
3046
3047			alu.last = 1;
3048			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3049				return r;
3050		}
3051
3052		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
3053		if (ctx->bc->chip_class == CAYMAN) {
3054			for (j = 0 ; j < 4; j++) {
3055				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3056				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3057
3058				alu.dst.sel = tmp0;
3059				alu.dst.chan = j;
3060				alu.dst.write = (j == 1);
3061
3062				if (signed_op) {
3063					alu.src[0].sel = tmp2;
3064					alu.src[0].chan = 1;
3065				} else {
3066					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3067				}
3068
3069				alu.src[1].sel = tmp0;
3070				alu.src[1].chan = 2;
3071
3072				alu.last = (j == 3);
3073				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3074					return r;
3075			}
3076		} else {
3077			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3078			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3079
3080			alu.dst.sel = tmp0;
3081			alu.dst.chan = 1;
3082			alu.dst.write = 1;
3083
3084			if (signed_op) {
3085				alu.src[0].sel = tmp2;
3086				alu.src[0].chan = 1;
3087			} else {
3088				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3089			}
3090
3091			alu.src[1].sel = tmp0;
3092			alu.src[1].chan = 2;
3093
3094			alu.last = 1;
3095			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3096				return r;
3097		}
3098
3099		/* 12. tmp0.w = src1 - tmp0.y       = r */
3100		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3101		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3102
3103		alu.dst.sel = tmp0;
3104		alu.dst.chan = 3;
3105		alu.dst.write = 1;
3106
3107		if (signed_op) {
3108			alu.src[0].sel = tmp2;
3109			alu.src[0].chan = 0;
3110		} else {
3111			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3112		}
3113
3114		alu.src[1].sel = tmp0;
3115		alu.src[1].chan = 1;
3116
3117		alu.last = 1;
3118		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3119			return r;
3120
3121		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
3122		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3123		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3124
3125		alu.dst.sel = tmp1;
3126		alu.dst.chan = 0;
3127		alu.dst.write = 1;
3128
3129		alu.src[0].sel = tmp0;
3130		alu.src[0].chan = 3;
3131		if (signed_op) {
3132			alu.src[1].sel = tmp2;
3133			alu.src[1].chan = 1;
3134		} else {
3135			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3136		}
3137
3138		alu.last = 1;
3139		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3140			return r;
3141
3142		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
3143		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3144		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3145
3146		alu.dst.sel = tmp1;
3147		alu.dst.chan = 1;
3148		alu.dst.write = 1;
3149
3150		if (signed_op) {
3151			alu.src[0].sel = tmp2;
3152			alu.src[0].chan = 0;
3153		} else {
3154			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3155		}
3156
3157		alu.src[1].sel = tmp0;
3158		alu.src[1].chan = 1;
3159
3160		alu.last = 1;
3161		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3162			return r;
3163
3164		if (mod) { /* UMOD */
3165
3166			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
3167			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3168			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3169
3170			alu.dst.sel = tmp1;
3171			alu.dst.chan = 2;
3172			alu.dst.write = 1;
3173
3174			alu.src[0].sel = tmp0;
3175			alu.src[0].chan = 3;
3176
3177			if (signed_op) {
3178				alu.src[1].sel = tmp2;
3179				alu.src[1].chan = 1;
3180			} else {
3181				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3182			}
3183
3184			alu.last = 1;
3185			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3186				return r;
3187
3188			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
3189			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3190			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3191
3192			alu.dst.sel = tmp1;
3193			alu.dst.chan = 3;
3194			alu.dst.write = 1;
3195
3196			alu.src[0].sel = tmp0;
3197			alu.src[0].chan = 3;
3198			if (signed_op) {
3199				alu.src[1].sel = tmp2;
3200				alu.src[1].chan = 1;
3201			} else {
3202				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3203			}
3204
3205			alu.last = 1;
3206			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3207				return r;
3208
3209		} else { /* UDIV */
3210
3211			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
3212			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3213			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3214
3215			alu.dst.sel = tmp1;
3216			alu.dst.chan = 2;
3217			alu.dst.write = 1;
3218
3219			alu.src[0].sel = tmp0;
3220			alu.src[0].chan = 2;
3221			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3222
3223			alu.last = 1;
3224			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3225				return r;
3226
3227			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
3228			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3229			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3230
3231			alu.dst.sel = tmp1;
3232			alu.dst.chan = 3;
3233			alu.dst.write = 1;
3234
3235			alu.src[0].sel = tmp0;
3236			alu.src[0].chan = 2;
3237			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
3238
3239			alu.last = 1;
3240			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3241				return r;
3242
3243		}
3244
3245		/* 17. tmp1.x = tmp1.x & tmp1.y */
3246		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3247		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
3248
3249		alu.dst.sel = tmp1;
3250		alu.dst.chan = 0;
3251		alu.dst.write = 1;
3252
3253		alu.src[0].sel = tmp1;
3254		alu.src[0].chan = 0;
3255		alu.src[1].sel = tmp1;
3256		alu.src[1].chan = 1;
3257
3258		alu.last = 1;
3259		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3260			return r;
3261
3262		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
3263		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
3264		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3265		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3266		alu.is_op3 = 1;
3267
3268		alu.dst.sel = tmp0;
3269		alu.dst.chan = 2;
3270		alu.dst.write = 1;
3271
3272		alu.src[0].sel = tmp1;
3273		alu.src[0].chan = 0;
3274		alu.src[1].sel = tmp0;
3275		alu.src[1].chan = mod ? 3 : 2;
3276		alu.src[2].sel = tmp1;
3277		alu.src[2].chan = 2;
3278
3279		alu.last = 1;
3280		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3281			return r;
3282
3283		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
3284		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3285		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3286		alu.is_op3 = 1;
3287
3288		if (signed_op) {
3289			alu.dst.sel = tmp0;
3290			alu.dst.chan = 2;
3291			alu.dst.write = 1;
3292		} else {
3293			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3294		}
3295
3296		alu.src[0].sel = tmp1;
3297		alu.src[0].chan = 1;
3298		alu.src[1].sel = tmp1;
3299		alu.src[1].chan = 3;
3300		alu.src[2].sel = tmp0;
3301		alu.src[2].chan = 2;
3302
3303		alu.last = 1;
3304		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3305			return r;
3306
3307		if (signed_op) {
3308
3309			/* fix the sign of the result */
3310
3311			if (mod) {
3312
3313				/* tmp0.x = -tmp0.z */
3314				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3315				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3316
3317				alu.dst.sel = tmp0;
3318				alu.dst.chan = 0;
3319				alu.dst.write = 1;
3320
3321				alu.src[0].sel = V_SQ_ALU_SRC_0;
3322				alu.src[1].sel = tmp0;
3323				alu.src[1].chan = 2;
3324
3325				alu.last = 1;
3326				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3327					return r;
3328
3329				/* sign of the remainder is the same as the sign of src0 */
3330				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
3331				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3332				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3333				alu.is_op3 = 1;
3334
3335				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3336
3337				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3338				alu.src[1].sel = tmp0;
3339				alu.src[1].chan = 2;
3340				alu.src[2].sel = tmp0;
3341				alu.src[2].chan = 0;
3342
3343				alu.last = 1;
3344				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3345					return r;
3346
3347			} else {
3348
3349				/* tmp0.x = -tmp0.z */
3350				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3351				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3352
3353				alu.dst.sel = tmp0;
3354				alu.dst.chan = 0;
3355				alu.dst.write = 1;
3356
3357				alu.src[0].sel = V_SQ_ALU_SRC_0;
3358				alu.src[1].sel = tmp0;
3359				alu.src[1].chan = 2;
3360
3361				alu.last = 1;
3362				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3363					return r;
3364
3365				/* fix the quotient sign (same as the sign of src0*src1) */
3366				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
3367				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3368				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3369				alu.is_op3 = 1;
3370
3371				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3372
3373				alu.src[0].sel = tmp2;
3374				alu.src[0].chan = 2;
3375				alu.src[1].sel = tmp0;
3376				alu.src[1].chan = 2;
3377				alu.src[2].sel = tmp0;
3378				alu.src[2].chan = 0;
3379
3380				alu.last = 1;
3381				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3382					return r;
3383			}
3384		}
3385	}
3386	return 0;
3387}
3388
/* UDIV: run the shared div/mod emitter with the remainder and signed flags both off. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3393
/* UMOD: run the shared div/mod emitter with the remainder flag on and the signed flag off. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3398
/* IDIV: run the shared div/mod emitter with the remainder flag off and the signed flag on. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3403
/* IMOD: run the shared div/mod emitter with the remainder and signed flags both on. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3408
3409
3410static int tgsi_f2i(struct r600_shader_ctx *ctx)
3411{
3412	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3413	struct r600_bytecode_alu alu;
3414	int i, r;
3415	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3416	int last_inst = tgsi_last_instruction(write_mask);
3417
3418	for (i = 0; i < 4; i++) {
3419		if (!(write_mask & (1<<i)))
3420			continue;
3421
3422		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3423		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC);
3424
3425		alu.dst.sel = ctx->temp_reg;
3426		alu.dst.chan = i;
3427		alu.dst.write = 1;
3428
3429		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3430		if (i == last_inst)
3431			alu.last = 1;
3432		r = r600_bytecode_add_alu(ctx->bc, &alu);
3433		if (r)
3434			return r;
3435	}
3436
3437	for (i = 0; i < 4; i++) {
3438		if (!(write_mask & (1<<i)))
3439			continue;
3440
3441		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3442		alu.inst = ctx->inst_info->r600_opcode;
3443
3444		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3445
3446		alu.src[0].sel = ctx->temp_reg;
3447		alu.src[0].chan = i;
3448
3449		if (i == last_inst || alu.inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT)
3450			alu.last = 1;
3451		r = r600_bytecode_add_alu(ctx->bc, &alu);
3452		if (r)
3453			return r;
3454	}
3455
3456	return 0;
3457}
3458
3459static int tgsi_iabs(struct r600_shader_ctx *ctx)
3460{
3461	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3462	struct r600_bytecode_alu alu;
3463	int i, r;
3464	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3465	int last_inst = tgsi_last_instruction(write_mask);
3466
3467	/* tmp = -src */
3468	for (i = 0; i < 4; i++) {
3469		if (!(write_mask & (1<<i)))
3470			continue;
3471
3472		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3473		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3474
3475		alu.dst.sel = ctx->temp_reg;
3476		alu.dst.chan = i;
3477		alu.dst.write = 1;
3478
3479		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3480		alu.src[0].sel = V_SQ_ALU_SRC_0;
3481
3482		if (i == last_inst)
3483			alu.last = 1;
3484		r = r600_bytecode_add_alu(ctx->bc, &alu);
3485		if (r)
3486			return r;
3487	}
3488
3489	/* dst = (src >= 0 ? src : tmp) */
3490	for (i = 0; i < 4; i++) {
3491		if (!(write_mask & (1<<i)))
3492			continue;
3493
3494		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3495		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3496		alu.is_op3 = 1;
3497		alu.dst.write = 1;
3498
3499		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3500
3501		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3502		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3503		alu.src[2].sel = ctx->temp_reg;
3504		alu.src[2].chan = i;
3505
3506		if (i == last_inst)
3507			alu.last = 1;
3508		r = r600_bytecode_add_alu(ctx->bc, &alu);
3509		if (r)
3510			return r;
3511	}
3512	return 0;
3513}
3514
3515static int tgsi_issg(struct r600_shader_ctx *ctx)
3516{
3517	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3518	struct r600_bytecode_alu alu;
3519	int i, r;
3520	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3521	int last_inst = tgsi_last_instruction(write_mask);
3522
3523	/* tmp = (src >= 0 ? src : -1) */
3524	for (i = 0; i < 4; i++) {
3525		if (!(write_mask & (1<<i)))
3526			continue;
3527
3528		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3529		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3530		alu.is_op3 = 1;
3531
3532		alu.dst.sel = ctx->temp_reg;
3533		alu.dst.chan = i;
3534		alu.dst.write = 1;
3535
3536		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3537		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3538		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
3539
3540		if (i == last_inst)
3541			alu.last = 1;
3542		r = r600_bytecode_add_alu(ctx->bc, &alu);
3543		if (r)
3544			return r;
3545	}
3546
3547	/* dst = (tmp > 0 ? 1 : tmp) */
3548	for (i = 0; i < 4; i++) {
3549		if (!(write_mask & (1<<i)))
3550			continue;
3551
3552		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3553		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT_INT);
3554		alu.is_op3 = 1;
3555		alu.dst.write = 1;
3556
3557		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3558
3559		alu.src[0].sel = ctx->temp_reg;
3560		alu.src[0].chan = i;
3561
3562		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3563
3564		alu.src[2].sel = ctx->temp_reg;
3565		alu.src[2].chan = i;
3566
3567		if (i == last_inst)
3568			alu.last = 1;
3569		r = r600_bytecode_add_alu(ctx->bc, &alu);
3570		if (r)
3571			return r;
3572	}
3573	return 0;
3574}
3575
3576
3577
3578static int tgsi_ssg(struct r600_shader_ctx *ctx)
3579{
3580	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3581	struct r600_bytecode_alu alu;
3582	int i, r;
3583
3584	/* tmp = (src > 0 ? 1 : src) */
3585	for (i = 0; i < 4; i++) {
3586		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3587		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3588		alu.is_op3 = 1;
3589
3590		alu.dst.sel = ctx->temp_reg;
3591		alu.dst.chan = i;
3592
3593		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3594		alu.src[1].sel = V_SQ_ALU_SRC_1;
3595		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
3596
3597		if (i == 3)
3598			alu.last = 1;
3599		r = r600_bytecode_add_alu(ctx->bc, &alu);
3600		if (r)
3601			return r;
3602	}
3603
3604	/* dst = (-tmp > 0 ? -1 : tmp) */
3605	for (i = 0; i < 4; i++) {
3606		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3607		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3608		alu.is_op3 = 1;
3609		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3610
3611		alu.src[0].sel = ctx->temp_reg;
3612		alu.src[0].chan = i;
3613		alu.src[0].neg = 1;
3614
3615		alu.src[1].sel = V_SQ_ALU_SRC_1;
3616		alu.src[1].neg = 1;
3617
3618		alu.src[2].sel = ctx->temp_reg;
3619		alu.src[2].chan = i;
3620
3621		if (i == 3)
3622			alu.last = 1;
3623		r = r600_bytecode_add_alu(ctx->bc, &alu);
3624		if (r)
3625			return r;
3626	}
3627	return 0;
3628}
3629
3630static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
3631{
3632	struct r600_bytecode_alu alu;
3633	int i, r;
3634
3635	for (i = 0; i < 4; i++) {
3636		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3637		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
3638			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
3639			alu.dst.chan = i;
3640		} else {
3641			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3642			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3643			alu.src[0].sel = ctx->temp_reg;
3644			alu.src[0].chan = i;
3645		}
3646		if (i == 3) {
3647			alu.last = 1;
3648		}
3649		r = r600_bytecode_add_alu(ctx->bc, &alu);
3650		if (r)
3651			return r;
3652	}
3653	return 0;
3654}
3655
3656static int tgsi_op3(struct r600_shader_ctx *ctx)
3657{
3658	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3659	struct r600_bytecode_alu alu;
3660	int i, j, r;
3661	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3662
3663	for (i = 0; i < lasti + 1; i++) {
3664		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3665			continue;
3666
3667		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3668		alu.inst = ctx->inst_info->r600_opcode;
3669		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3670			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3671		}
3672
3673		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3674		alu.dst.chan = i;
3675		alu.dst.write = 1;
3676		alu.is_op3 = 1;
3677		if (i == lasti) {
3678			alu.last = 1;
3679		}
3680		r = r600_bytecode_add_alu(ctx->bc, &alu);
3681		if (r)
3682			return r;
3683	}
3684	return 0;
3685}
3686
3687static int tgsi_dp(struct r600_shader_ctx *ctx)
3688{
3689	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3690	struct r600_bytecode_alu alu;
3691	int i, j, r;
3692
3693	for (i = 0; i < 4; i++) {
3694		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3695		alu.inst = ctx->inst_info->r600_opcode;
3696		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3697			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3698		}
3699
3700		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3701		alu.dst.chan = i;
3702		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3703		/* handle some special cases */
3704		switch (ctx->inst_info->tgsi_opcode) {
3705		case TGSI_OPCODE_DP2:
3706			if (i > 1) {
3707				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3708				alu.src[0].chan = alu.src[1].chan = 0;
3709			}
3710			break;
3711		case TGSI_OPCODE_DP3:
3712			if (i > 2) {
3713				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3714				alu.src[0].chan = alu.src[1].chan = 0;
3715			}
3716			break;
3717		case TGSI_OPCODE_DPH:
3718			if (i == 3) {
3719				alu.src[0].sel = V_SQ_ALU_SRC_1;
3720				alu.src[0].chan = 0;
3721				alu.src[0].neg = 0;
3722			}
3723			break;
3724		default:
3725			break;
3726		}
3727		if (i == 3) {
3728			alu.last = 1;
3729		}
3730		r = r600_bytecode_add_alu(ctx->bc, &alu);
3731		if (r)
3732			return r;
3733	}
3734	return 0;
3735}
3736
3737static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
3738						    unsigned index)
3739{
3740	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3741	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
3742		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
3743		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
3744		ctx->src[index].neg || ctx->src[index].abs;
3745}
3746
3747static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
3748					unsigned index)
3749{
3750	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3751	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
3752}
3753
3754static int tgsi_tex(struct r600_shader_ctx *ctx)
3755{
3756	static float one_point_five = 1.5f;
3757	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3758	struct r600_bytecode_tex tex;
3759	struct r600_bytecode_alu alu;
3760	unsigned src_gpr;
3761	int r, i, j;
3762	int opcode;
3763	/* Texture fetch instructions can only use gprs as source.
3764	 * Also they cannot negate the source or take the absolute value */
3765	const boolean src_requires_loading = inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
3766                                             tgsi_tex_src_requires_loading(ctx, 0);
3767	boolean src_loaded = FALSE;
3768	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
3769	uint8_t offset_x = 0, offset_y = 0, offset_z = 0;
3770
3771	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
3772
3773	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
3774		/* get offset values */
3775		if (inst->Texture.NumOffsets) {
3776			assert(inst->Texture.NumOffsets == 1);
3777
3778			offset_x = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
3779			offset_y = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
3780			offset_z = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
3781		}
3782	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
3783		/* TGSI moves the sampler to src reg 3 for TXD */
3784		sampler_src_reg = 3;
3785
3786		for (i = 1; i < 3; i++) {
3787			/* set gradients h/v */
3788			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
3789			tex.inst = (i == 1) ? SQ_TEX_INST_SET_GRADIENTS_H :
3790				SQ_TEX_INST_SET_GRADIENTS_V;
3791			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
3792			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
3793
3794			if (tgsi_tex_src_requires_loading(ctx, i)) {
3795				tex.src_gpr = r600_get_temp(ctx);
3796				tex.src_sel_x = 0;
3797				tex.src_sel_y = 1;
3798				tex.src_sel_z = 2;
3799				tex.src_sel_w = 3;
3800
3801				for (j = 0; j < 4; j++) {
3802					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3803					alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3804                                        r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
3805                                        alu.dst.sel = tex.src_gpr;
3806                                        alu.dst.chan = j;
3807                                        if (j == 3)
3808                                                alu.last = 1;
3809                                        alu.dst.write = 1;
3810                                        r = r600_bytecode_add_alu(ctx->bc, &alu);
3811                                        if (r)
3812                                                return r;
3813				}
3814
3815			} else {
3816				tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i);
3817				tex.src_sel_x = ctx->src[i].swizzle[0];
3818				tex.src_sel_y = ctx->src[i].swizzle[1];
3819				tex.src_sel_z = ctx->src[i].swizzle[2];
3820				tex.src_sel_w = ctx->src[i].swizzle[3];
3821				tex.src_rel = ctx->src[i].rel;
3822			}
3823			tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */
3824			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
3825			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
3826				tex.coord_type_x = 1;
3827				tex.coord_type_y = 1;
3828				tex.coord_type_z = 1;
3829				tex.coord_type_w = 1;
3830			}
3831			r = r600_bytecode_add_tex(ctx->bc, &tex);
3832			if (r)
3833				return r;
3834		}
3835	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
3836		int out_chan;
3837		/* Add perspective divide */
3838		if (ctx->bc->chip_class == CAYMAN) {
3839			out_chan = 2;
3840			for (i = 0; i < 3; i++) {
3841				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3842				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3843				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3844
3845				alu.dst.sel = ctx->temp_reg;
3846				alu.dst.chan = i;
3847				if (i == 2)
3848					alu.last = 1;
3849				if (out_chan == i)
3850					alu.dst.write = 1;
3851				r = r600_bytecode_add_alu(ctx->bc, &alu);
3852				if (r)
3853					return r;
3854			}
3855
3856		} else {
3857			out_chan = 3;
3858			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3859			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3860			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3861
3862			alu.dst.sel = ctx->temp_reg;
3863			alu.dst.chan = out_chan;
3864			alu.last = 1;
3865			alu.dst.write = 1;
3866			r = r600_bytecode_add_alu(ctx->bc, &alu);
3867			if (r)
3868				return r;
3869		}
3870
3871		for (i = 0; i < 3; i++) {
3872			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3873			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
3874			alu.src[0].sel = ctx->temp_reg;
3875			alu.src[0].chan = out_chan;
3876			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3877			alu.dst.sel = ctx->temp_reg;
3878			alu.dst.chan = i;
3879			alu.dst.write = 1;
3880			r = r600_bytecode_add_alu(ctx->bc, &alu);
3881			if (r)
3882				return r;
3883		}
3884		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3885		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3886		alu.src[0].sel = V_SQ_ALU_SRC_1;
3887		alu.src[0].chan = 0;
3888		alu.dst.sel = ctx->temp_reg;
3889		alu.dst.chan = 3;
3890		alu.last = 1;
3891		alu.dst.write = 1;
3892		r = r600_bytecode_add_alu(ctx->bc, &alu);
3893		if (r)
3894			return r;
3895		src_loaded = TRUE;
3896		src_gpr = ctx->temp_reg;
3897	}
3898
3899	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
3900	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) &&
3901	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
3902	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
3903
3904		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
3905		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
3906
3907		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
3908		for (i = 0; i < 4; i++) {
3909			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3910			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE);
3911			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
3912			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
3913			alu.dst.sel = ctx->temp_reg;
3914			alu.dst.chan = i;
3915			if (i == 3)
3916				alu.last = 1;
3917			alu.dst.write = 1;
3918			r = r600_bytecode_add_alu(ctx->bc, &alu);
3919			if (r)
3920				return r;
3921		}
3922
3923		/* tmp1.z = RCP_e(|tmp1.z|) */
3924		if (ctx->bc->chip_class == CAYMAN) {
3925			for (i = 0; i < 3; i++) {
3926				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3927				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3928				alu.src[0].sel = ctx->temp_reg;
3929				alu.src[0].chan = 2;
3930				alu.src[0].abs = 1;
3931				alu.dst.sel = ctx->temp_reg;
3932				alu.dst.chan = i;
3933				if (i == 2)
3934					alu.dst.write = 1;
3935				if (i == 2)
3936					alu.last = 1;
3937				r = r600_bytecode_add_alu(ctx->bc, &alu);
3938				if (r)
3939					return r;
3940			}
3941		} else {
3942			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3943			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3944			alu.src[0].sel = ctx->temp_reg;
3945			alu.src[0].chan = 2;
3946			alu.src[0].abs = 1;
3947			alu.dst.sel = ctx->temp_reg;
3948			alu.dst.chan = 2;
3949			alu.dst.write = 1;
3950			alu.last = 1;
3951			r = r600_bytecode_add_alu(ctx->bc, &alu);
3952			if (r)
3953				return r;
3954		}
3955
3956		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
3957		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
3958		 * muladd has no writemask, have to use another temp
3959		 */
3960		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3961		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
3962		alu.is_op3 = 1;
3963
3964		alu.src[0].sel = ctx->temp_reg;
3965		alu.src[0].chan = 0;
3966		alu.src[1].sel = ctx->temp_reg;
3967		alu.src[1].chan = 2;
3968
3969		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
3970		alu.src[2].chan = 0;
3971		alu.src[2].value = *(uint32_t *)&one_point_five;
3972
3973		alu.dst.sel = ctx->temp_reg;
3974		alu.dst.chan = 0;
3975		alu.dst.write = 1;
3976
3977		r = r600_bytecode_add_alu(ctx->bc, &alu);
3978		if (r)
3979			return r;
3980
3981		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3982		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
3983		alu.is_op3 = 1;
3984
3985		alu.src[0].sel = ctx->temp_reg;
3986		alu.src[0].chan = 1;
3987		alu.src[1].sel = ctx->temp_reg;
3988		alu.src[1].chan = 2;
3989
3990		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
3991		alu.src[2].chan = 0;
3992		alu.src[2].value = *(uint32_t *)&one_point_five;
3993
3994		alu.dst.sel = ctx->temp_reg;
3995		alu.dst.chan = 1;
3996		alu.dst.write = 1;
3997
3998		alu.last = 1;
3999		r = r600_bytecode_add_alu(ctx->bc, &alu);
4000		if (r)
4001			return r;
4002		/* write initial W value into Z component */
4003		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
4004			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4005			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4006			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4007			alu.dst.sel = ctx->temp_reg;
4008			alu.dst.chan = 2;
4009			alu.dst.write = 1;
4010			alu.last = 1;
4011			r = r600_bytecode_add_alu(ctx->bc, &alu);
4012			if (r)
4013				return r;
4014		}
4015
4016		/* for cube forms of lod and bias we need to route the lod
4017		   value into Z */
4018		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
4019		    inst->Instruction.Opcode == TGSI_OPCODE_TXL) {
4020			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4021			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4022			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4023			alu.dst.sel = ctx->temp_reg;
4024			alu.dst.chan = 2;
4025			alu.last = 1;
4026			alu.dst.write = 1;
4027			r = r600_bytecode_add_alu(ctx->bc, &alu);
4028			if (r)
4029				return r;
4030		}
4031
4032		src_loaded = TRUE;
4033		src_gpr = ctx->temp_reg;
4034	}
4035
4036	if (src_requires_loading && !src_loaded) {
4037		for (i = 0; i < 4; i++) {
4038			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4039			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4040			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4041			alu.dst.sel = ctx->temp_reg;
4042			alu.dst.chan = i;
4043			if (i == 3)
4044				alu.last = 1;
4045			alu.dst.write = 1;
4046			r = r600_bytecode_add_alu(ctx->bc, &alu);
4047			if (r)
4048				return r;
4049		}
4050		src_loaded = TRUE;
4051		src_gpr = ctx->temp_reg;
4052	}
4053
4054	opcode = ctx->inst_info->r600_opcode;
4055	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4056	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4057	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4058	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4059	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
4060	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
4061		switch (opcode) {
4062		case SQ_TEX_INST_SAMPLE:
4063			opcode = SQ_TEX_INST_SAMPLE_C;
4064			break;
4065		case SQ_TEX_INST_SAMPLE_L:
4066			opcode = SQ_TEX_INST_SAMPLE_C_L;
4067			break;
4068		case SQ_TEX_INST_SAMPLE_LB:
4069			opcode = SQ_TEX_INST_SAMPLE_C_LB;
4070			break;
4071		case SQ_TEX_INST_SAMPLE_G:
4072			opcode = SQ_TEX_INST_SAMPLE_C_G;
4073			break;
4074		}
4075	}
4076
4077	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4078	tex.inst = opcode;
4079
4080	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4081	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4082	tex.src_gpr = src_gpr;
4083	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4084	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
4085	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
4086	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
4087	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
4088
4089	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
4090		tex.src_sel_x = 4;
4091		tex.src_sel_y = 4;
4092		tex.src_sel_z = 4;
4093		tex.src_sel_w = 4;
4094	} else if (src_loaded) {
4095		tex.src_sel_x = 0;
4096		tex.src_sel_y = 1;
4097		tex.src_sel_z = 2;
4098		tex.src_sel_w = 3;
4099	} else {
4100		tex.src_sel_x = ctx->src[0].swizzle[0];
4101		tex.src_sel_y = ctx->src[0].swizzle[1];
4102		tex.src_sel_z = ctx->src[0].swizzle[2];
4103		tex.src_sel_w = ctx->src[0].swizzle[3];
4104		tex.src_rel = ctx->src[0].rel;
4105	}
4106
4107	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
4108	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
4109		tex.src_sel_x = 1;
4110		tex.src_sel_y = 0;
4111		tex.src_sel_z = 3;
4112		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
4113	}
4114
4115	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
4116	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
4117		tex.coord_type_x = 1;
4118		tex.coord_type_y = 1;
4119	}
4120	tex.coord_type_z = 1;
4121	tex.coord_type_w = 1;
4122
4123	tex.offset_x = offset_x;
4124	tex.offset_y = offset_y;
4125	tex.offset_z = offset_z;
4126
4127	/* Put the depth for comparison in W.
4128	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
4129	 * Some instructions expect the depth in Z. */
4130	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4131	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4132	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4133	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
4134	    opcode != SQ_TEX_INST_SAMPLE_C_L &&
4135	    opcode != SQ_TEX_INST_SAMPLE_C_LB) {
4136		tex.src_sel_w = tex.src_sel_z;
4137	}
4138
4139	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
4140	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4141		if (opcode == SQ_TEX_INST_SAMPLE_C_L ||
4142		    opcode == SQ_TEX_INST_SAMPLE_C_LB) {
4143			/* the array index is read from Y */
4144			tex.coord_type_y = 0;
4145		} else {
4146			/* the array index is read from Z */
4147			tex.coord_type_z = 0;
4148			tex.src_sel_z = tex.src_sel_y;
4149		}
4150	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
4151		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
4152		/* the array index is read from Z */
4153		tex.coord_type_z = 0;
4154
4155	r = r600_bytecode_add_tex(ctx->bc, &tex);
4156	if (r)
4157		return r;
4158
4159	/* add shadow ambient support  - gallium doesn't do it yet */
4160	return 0;
4161}
4162
4163static int tgsi_lrp(struct r600_shader_ctx *ctx)
4164{
4165	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4166	struct r600_bytecode_alu alu;
4167	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4168	unsigned i;
4169	int r;
4170
4171	/* optimize if it's just an equal balance */
4172	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
4173		for (i = 0; i < lasti + 1; i++) {
4174			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4175				continue;
4176
4177			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4178			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4179			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4180			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4181			alu.omod = 3;
4182			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4183			alu.dst.chan = i;
4184			if (i == lasti) {
4185				alu.last = 1;
4186			}
4187			r = r600_bytecode_add_alu(ctx->bc, &alu);
4188			if (r)
4189				return r;
4190		}
4191		return 0;
4192	}
4193
4194	/* 1 - src0 */
4195	for (i = 0; i < lasti + 1; i++) {
4196		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4197			continue;
4198
4199		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4200		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4201		alu.src[0].sel = V_SQ_ALU_SRC_1;
4202		alu.src[0].chan = 0;
4203		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4204		r600_bytecode_src_toggle_neg(&alu.src[1]);
4205		alu.dst.sel = ctx->temp_reg;
4206		alu.dst.chan = i;
4207		if (i == lasti) {
4208			alu.last = 1;
4209		}
4210		alu.dst.write = 1;
4211		r = r600_bytecode_add_alu(ctx->bc, &alu);
4212		if (r)
4213			return r;
4214	}
4215
4216	/* (1 - src0) * src2 */
4217	for (i = 0; i < lasti + 1; i++) {
4218		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4219			continue;
4220
4221		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4222		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4223		alu.src[0].sel = ctx->temp_reg;
4224		alu.src[0].chan = i;
4225		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4226		alu.dst.sel = ctx->temp_reg;
4227		alu.dst.chan = i;
4228		if (i == lasti) {
4229			alu.last = 1;
4230		}
4231		alu.dst.write = 1;
4232		r = r600_bytecode_add_alu(ctx->bc, &alu);
4233		if (r)
4234			return r;
4235	}
4236
4237	/* src0 * src1 + (1 - src0) * src2 */
4238	for (i = 0; i < lasti + 1; i++) {
4239		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4240			continue;
4241
4242		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4243		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4244		alu.is_op3 = 1;
4245		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4246		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4247		alu.src[2].sel = ctx->temp_reg;
4248		alu.src[2].chan = i;
4249
4250		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4251		alu.dst.chan = i;
4252		if (i == lasti) {
4253			alu.last = 1;
4254		}
4255		r = r600_bytecode_add_alu(ctx->bc, &alu);
4256		if (r)
4257			return r;
4258	}
4259	return 0;
4260}
4261
4262static int tgsi_cmp(struct r600_shader_ctx *ctx)
4263{
4264	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4265	struct r600_bytecode_alu alu;
4266	int i, r;
4267	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4268
4269	for (i = 0; i < lasti + 1; i++) {
4270		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4271			continue;
4272
4273		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4274		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE);
4275		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4276		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4277		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
4278		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4279		alu.dst.chan = i;
4280		alu.dst.write = 1;
4281		alu.is_op3 = 1;
4282		if (i == lasti)
4283			alu.last = 1;
4284		r = r600_bytecode_add_alu(ctx->bc, &alu);
4285		if (r)
4286			return r;
4287	}
4288	return 0;
4289}
4290
4291static int tgsi_xpd(struct r600_shader_ctx *ctx)
4292{
4293	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4294	static const unsigned int src0_swizzle[] = {2, 0, 1};
4295	static const unsigned int src1_swizzle[] = {1, 2, 0};
4296	struct r600_bytecode_alu alu;
4297	uint32_t use_temp = 0;
4298	int i, r;
4299
4300	if (inst->Dst[0].Register.WriteMask != 0xf)
4301		use_temp = 1;
4302
4303	for (i = 0; i < 4; i++) {
4304		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4305		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4306		if (i < 3) {
4307			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
4308			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
4309		} else {
4310			alu.src[0].sel = V_SQ_ALU_SRC_0;
4311			alu.src[0].chan = i;
4312			alu.src[1].sel = V_SQ_ALU_SRC_0;
4313			alu.src[1].chan = i;
4314		}
4315
4316		alu.dst.sel = ctx->temp_reg;
4317		alu.dst.chan = i;
4318		alu.dst.write = 1;
4319
4320		if (i == 3)
4321			alu.last = 1;
4322		r = r600_bytecode_add_alu(ctx->bc, &alu);
4323		if (r)
4324			return r;
4325	}
4326
4327	for (i = 0; i < 4; i++) {
4328		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4329		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4330
4331		if (i < 3) {
4332			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
4333			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
4334		} else {
4335			alu.src[0].sel = V_SQ_ALU_SRC_0;
4336			alu.src[0].chan = i;
4337			alu.src[1].sel = V_SQ_ALU_SRC_0;
4338			alu.src[1].chan = i;
4339		}
4340
4341		alu.src[2].sel = ctx->temp_reg;
4342		alu.src[2].neg = 1;
4343		alu.src[2].chan = i;
4344
4345		if (use_temp)
4346			alu.dst.sel = ctx->temp_reg;
4347		else
4348			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4349		alu.dst.chan = i;
4350		alu.dst.write = 1;
4351		alu.is_op3 = 1;
4352		if (i == 3)
4353			alu.last = 1;
4354		r = r600_bytecode_add_alu(ctx->bc, &alu);
4355		if (r)
4356			return r;
4357	}
4358	if (use_temp)
4359		return tgsi_helper_copy(ctx, inst);
4360	return 0;
4361}
4362
4363static int tgsi_exp(struct r600_shader_ctx *ctx)
4364{
4365	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4366	struct r600_bytecode_alu alu;
4367	int r;
4368	int i;
4369
4370	/* result.x = 2^floor(src); */
4371	if (inst->Dst[0].Register.WriteMask & 1) {
4372		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4373
4374		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4375		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4376
4377		alu.dst.sel = ctx->temp_reg;
4378		alu.dst.chan = 0;
4379		alu.dst.write = 1;
4380		alu.last = 1;
4381		r = r600_bytecode_add_alu(ctx->bc, &alu);
4382		if (r)
4383			return r;
4384
4385		if (ctx->bc->chip_class == CAYMAN) {
4386			for (i = 0; i < 3; i++) {
4387				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4388				alu.src[0].sel = ctx->temp_reg;
4389				alu.src[0].chan = 0;
4390
4391				alu.dst.sel = ctx->temp_reg;
4392				alu.dst.chan = i;
4393				alu.dst.write = i == 0;
4394				alu.last = i == 2;
4395				r = r600_bytecode_add_alu(ctx->bc, &alu);
4396				if (r)
4397					return r;
4398			}
4399		} else {
4400			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4401			alu.src[0].sel = ctx->temp_reg;
4402			alu.src[0].chan = 0;
4403
4404			alu.dst.sel = ctx->temp_reg;
4405			alu.dst.chan = 0;
4406			alu.dst.write = 1;
4407			alu.last = 1;
4408			r = r600_bytecode_add_alu(ctx->bc, &alu);
4409			if (r)
4410				return r;
4411		}
4412	}
4413
4414	/* result.y = tmp - floor(tmp); */
4415	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4416		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4417
4418		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
4419		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4420
4421		alu.dst.sel = ctx->temp_reg;
4422#if 0
4423		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4424		if (r)
4425			return r;
4426#endif
4427		alu.dst.write = 1;
4428		alu.dst.chan = 1;
4429
4430		alu.last = 1;
4431
4432		r = r600_bytecode_add_alu(ctx->bc, &alu);
4433		if (r)
4434			return r;
4435	}
4436
4437	/* result.z = RoughApprox2ToX(tmp);*/
4438	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
4439		if (ctx->bc->chip_class == CAYMAN) {
4440			for (i = 0; i < 3; i++) {
4441				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4442				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4443				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4444
4445				alu.dst.sel = ctx->temp_reg;
4446				alu.dst.chan = i;
4447				if (i == 2) {
4448					alu.dst.write = 1;
4449					alu.last = 1;
4450				}
4451
4452				r = r600_bytecode_add_alu(ctx->bc, &alu);
4453				if (r)
4454					return r;
4455			}
4456		} else {
4457			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4458			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4459			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4460
4461			alu.dst.sel = ctx->temp_reg;
4462			alu.dst.write = 1;
4463			alu.dst.chan = 2;
4464
4465			alu.last = 1;
4466
4467			r = r600_bytecode_add_alu(ctx->bc, &alu);
4468			if (r)
4469				return r;
4470		}
4471	}
4472
4473	/* result.w = 1.0;*/
4474	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
4475		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4476
4477		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4478		alu.src[0].sel = V_SQ_ALU_SRC_1;
4479		alu.src[0].chan = 0;
4480
4481		alu.dst.sel = ctx->temp_reg;
4482		alu.dst.chan = 3;
4483		alu.dst.write = 1;
4484		alu.last = 1;
4485		r = r600_bytecode_add_alu(ctx->bc, &alu);
4486		if (r)
4487			return r;
4488	}
4489	return tgsi_helper_copy(ctx, inst);
4490}
4491
4492static int tgsi_log(struct r600_shader_ctx *ctx)
4493{
4494	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4495	struct r600_bytecode_alu alu;
4496	int r;
4497	int i;
4498
4499	/* result.x = floor(log2(|src|)); */
4500	if (inst->Dst[0].Register.WriteMask & 1) {
4501		if (ctx->bc->chip_class == CAYMAN) {
4502			for (i = 0; i < 3; i++) {
4503				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4504
4505				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4506				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4507				r600_bytecode_src_set_abs(&alu.src[0]);
4508
4509				alu.dst.sel = ctx->temp_reg;
4510				alu.dst.chan = i;
4511				if (i == 0)
4512					alu.dst.write = 1;
4513				if (i == 2)
4514					alu.last = 1;
4515				r = r600_bytecode_add_alu(ctx->bc, &alu);
4516				if (r)
4517					return r;
4518			}
4519
4520		} else {
4521			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4522
4523			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4524			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4525			r600_bytecode_src_set_abs(&alu.src[0]);
4526
4527			alu.dst.sel = ctx->temp_reg;
4528			alu.dst.chan = 0;
4529			alu.dst.write = 1;
4530			alu.last = 1;
4531			r = r600_bytecode_add_alu(ctx->bc, &alu);
4532			if (r)
4533				return r;
4534		}
4535
4536		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4537		alu.src[0].sel = ctx->temp_reg;
4538		alu.src[0].chan = 0;
4539
4540		alu.dst.sel = ctx->temp_reg;
4541		alu.dst.chan = 0;
4542		alu.dst.write = 1;
4543		alu.last = 1;
4544
4545		r = r600_bytecode_add_alu(ctx->bc, &alu);
4546		if (r)
4547			return r;
4548	}
4549
4550	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
4551	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4552
4553		if (ctx->bc->chip_class == CAYMAN) {
4554			for (i = 0; i < 3; i++) {
4555				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4556
4557				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4558				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4559				r600_bytecode_src_set_abs(&alu.src[0]);
4560
4561				alu.dst.sel = ctx->temp_reg;
4562				alu.dst.chan = i;
4563				if (i == 1)
4564					alu.dst.write = 1;
4565				if (i == 2)
4566					alu.last = 1;
4567
4568				r = r600_bytecode_add_alu(ctx->bc, &alu);
4569				if (r)
4570					return r;
4571			}
4572		} else {
4573			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4574
4575			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4576			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4577			r600_bytecode_src_set_abs(&alu.src[0]);
4578
4579			alu.dst.sel = ctx->temp_reg;
4580			alu.dst.chan = 1;
4581			alu.dst.write = 1;
4582			alu.last = 1;
4583
4584			r = r600_bytecode_add_alu(ctx->bc, &alu);
4585			if (r)
4586				return r;
4587		}
4588
4589		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4590
4591		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4592		alu.src[0].sel = ctx->temp_reg;
4593		alu.src[0].chan = 1;
4594
4595		alu.dst.sel = ctx->temp_reg;
4596		alu.dst.chan = 1;
4597		alu.dst.write = 1;
4598		alu.last = 1;
4599
4600		r = r600_bytecode_add_alu(ctx->bc, &alu);
4601		if (r)
4602			return r;
4603
4604		if (ctx->bc->chip_class == CAYMAN) {
4605			for (i = 0; i < 3; i++) {
4606				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4607				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4608				alu.src[0].sel = ctx->temp_reg;
4609				alu.src[0].chan = 1;
4610
4611				alu.dst.sel = ctx->temp_reg;
4612				alu.dst.chan = i;
4613				if (i == 1)
4614					alu.dst.write = 1;
4615				if (i == 2)
4616					alu.last = 1;
4617
4618				r = r600_bytecode_add_alu(ctx->bc, &alu);
4619				if (r)
4620					return r;
4621			}
4622		} else {
4623			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4624			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4625			alu.src[0].sel = ctx->temp_reg;
4626			alu.src[0].chan = 1;
4627
4628			alu.dst.sel = ctx->temp_reg;
4629			alu.dst.chan = 1;
4630			alu.dst.write = 1;
4631			alu.last = 1;
4632
4633			r = r600_bytecode_add_alu(ctx->bc, &alu);
4634			if (r)
4635				return r;
4636		}
4637
4638		if (ctx->bc->chip_class == CAYMAN) {
4639			for (i = 0; i < 3; i++) {
4640				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4641				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4642				alu.src[0].sel = ctx->temp_reg;
4643				alu.src[0].chan = 1;
4644
4645				alu.dst.sel = ctx->temp_reg;
4646				alu.dst.chan = i;
4647				if (i == 1)
4648					alu.dst.write = 1;
4649				if (i == 2)
4650					alu.last = 1;
4651
4652				r = r600_bytecode_add_alu(ctx->bc, &alu);
4653				if (r)
4654					return r;
4655			}
4656		} else {
4657			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4658			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4659			alu.src[0].sel = ctx->temp_reg;
4660			alu.src[0].chan = 1;
4661
4662			alu.dst.sel = ctx->temp_reg;
4663			alu.dst.chan = 1;
4664			alu.dst.write = 1;
4665			alu.last = 1;
4666
4667			r = r600_bytecode_add_alu(ctx->bc, &alu);
4668			if (r)
4669				return r;
4670		}
4671
4672		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4673
4674		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4675
4676		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4677		r600_bytecode_src_set_abs(&alu.src[0]);
4678
4679		alu.src[1].sel = ctx->temp_reg;
4680		alu.src[1].chan = 1;
4681
4682		alu.dst.sel = ctx->temp_reg;
4683		alu.dst.chan = 1;
4684		alu.dst.write = 1;
4685		alu.last = 1;
4686
4687		r = r600_bytecode_add_alu(ctx->bc, &alu);
4688		if (r)
4689			return r;
4690	}
4691
4692	/* result.z = log2(|src|);*/
4693	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
4694		if (ctx->bc->chip_class == CAYMAN) {
4695			for (i = 0; i < 3; i++) {
4696				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4697
4698				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4699				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4700				r600_bytecode_src_set_abs(&alu.src[0]);
4701
4702				alu.dst.sel = ctx->temp_reg;
4703				if (i == 2)
4704					alu.dst.write = 1;
4705				alu.dst.chan = i;
4706				if (i == 2)
4707					alu.last = 1;
4708
4709				r = r600_bytecode_add_alu(ctx->bc, &alu);
4710				if (r)
4711					return r;
4712			}
4713		} else {
4714			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4715
4716			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4717			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4718			r600_bytecode_src_set_abs(&alu.src[0]);
4719
4720			alu.dst.sel = ctx->temp_reg;
4721			alu.dst.write = 1;
4722			alu.dst.chan = 2;
4723			alu.last = 1;
4724
4725			r = r600_bytecode_add_alu(ctx->bc, &alu);
4726			if (r)
4727				return r;
4728		}
4729	}
4730
4731	/* result.w = 1.0; */
4732	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
4733		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4734
4735		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4736		alu.src[0].sel = V_SQ_ALU_SRC_1;
4737		alu.src[0].chan = 0;
4738
4739		alu.dst.sel = ctx->temp_reg;
4740		alu.dst.chan = 3;
4741		alu.dst.write = 1;
4742		alu.last = 1;
4743
4744		r = r600_bytecode_add_alu(ctx->bc, &alu);
4745		if (r)
4746			return r;
4747	}
4748
4749	return tgsi_helper_copy(ctx, inst);
4750}
4751
4752static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
4753{
4754	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4755	struct r600_bytecode_alu alu;
4756	int r;
4757
4758	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4759
4760	switch (inst->Instruction.Opcode) {
4761	case TGSI_OPCODE_ARL:
4762		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR;
4763		break;
4764	case TGSI_OPCODE_ARR:
4765		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4766		break;
4767	case TGSI_OPCODE_UARL:
4768		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4769		break;
4770	default:
4771		assert(0);
4772		return -1;
4773	}
4774
4775	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4776	alu.last = 1;
4777	alu.dst.sel = ctx->bc->ar_reg;
4778	alu.dst.write = 1;
4779	r = r600_bytecode_add_alu(ctx->bc, &alu);
4780	if (r)
4781		return r;
4782
4783	ctx->bc->ar_loaded = 0;
4784	return 0;
4785}
4786static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
4787{
4788	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4789	struct r600_bytecode_alu alu;
4790	int r;
4791
4792	switch (inst->Instruction.Opcode) {
4793	case TGSI_OPCODE_ARL:
4794		memset(&alu, 0, sizeof(alu));
4795		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR;
4796		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4797		alu.dst.sel = ctx->bc->ar_reg;
4798		alu.dst.write = 1;
4799		alu.last = 1;
4800
4801		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4802			return r;
4803
4804		memset(&alu, 0, sizeof(alu));
4805		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4806		alu.src[0].sel = ctx->bc->ar_reg;
4807		alu.dst.sel = ctx->bc->ar_reg;
4808		alu.dst.write = 1;
4809		alu.last = 1;
4810
4811		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4812			return r;
4813		break;
4814	case TGSI_OPCODE_ARR:
4815		memset(&alu, 0, sizeof(alu));
4816		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4817		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4818		alu.dst.sel = ctx->bc->ar_reg;
4819		alu.dst.write = 1;
4820		alu.last = 1;
4821
4822		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4823			return r;
4824		break;
4825	case TGSI_OPCODE_UARL:
4826		memset(&alu, 0, sizeof(alu));
4827		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4828		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4829		alu.dst.sel = ctx->bc->ar_reg;
4830		alu.dst.write = 1;
4831		alu.last = 1;
4832
4833		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4834			return r;
4835		break;
4836	default:
4837		assert(0);
4838		return -1;
4839	}
4840
4841	ctx->bc->ar_loaded = 0;
4842	return 0;
4843}
4844
4845static int tgsi_opdst(struct r600_shader_ctx *ctx)
4846{
4847	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4848	struct r600_bytecode_alu alu;
4849	int i, r = 0;
4850
4851	for (i = 0; i < 4; i++) {
4852		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4853
4854		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4855		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4856
4857		if (i == 0 || i == 3) {
4858			alu.src[0].sel = V_SQ_ALU_SRC_1;
4859		} else {
4860			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4861		}
4862
4863		if (i == 0 || i == 2) {
4864			alu.src[1].sel = V_SQ_ALU_SRC_1;
4865		} else {
4866			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4867		}
4868		if (i == 3)
4869			alu.last = 1;
4870		r = r600_bytecode_add_alu(ctx->bc, &alu);
4871		if (r)
4872			return r;
4873	}
4874	return 0;
4875}
4876
4877static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
4878{
4879	struct r600_bytecode_alu alu;
4880	int r;
4881
4882	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4883	alu.inst = opcode;
4884	alu.execute_mask = 1;
4885	alu.update_pred = 1;
4886
4887	alu.dst.sel = ctx->temp_reg;
4888	alu.dst.write = 1;
4889	alu.dst.chan = 0;
4890
4891	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4892	alu.src[1].sel = V_SQ_ALU_SRC_0;
4893	alu.src[1].chan = 0;
4894
4895	alu.last = 1;
4896
4897	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
4898	if (r)
4899		return r;
4900	return 0;
4901}
4902
4903static int pops(struct r600_shader_ctx *ctx, int pops)
4904{
4905	unsigned force_pop = ctx->bc->force_add_cf;
4906
4907	if (!force_pop) {
4908		int alu_pop = 3;
4909		if (ctx->bc->cf_last) {
4910			if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU))
4911				alu_pop = 0;
4912			else if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER))
4913				alu_pop = 1;
4914		}
4915		alu_pop += pops;
4916		if (alu_pop == 1) {
4917			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER);
4918			ctx->bc->force_add_cf = 1;
4919		} else if (alu_pop == 2) {
4920			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER);
4921			ctx->bc->force_add_cf = 1;
4922		} else {
4923			force_pop = 1;
4924		}
4925	}
4926
4927	if (force_pop) {
4928		r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_POP));
4929		ctx->bc->cf_last->pop_count = pops;
4930		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
4931	}
4932
4933	return 0;
4934}
4935
4936static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
4937{
4938	switch(reason) {
4939	case FC_PUSH_VPM:
4940		ctx->bc->callstack[ctx->bc->call_sp].current--;
4941		break;
4942	case FC_PUSH_WQM:
4943	case FC_LOOP:
4944		ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
4945		break;
4946	case FC_REP:
4947		/* TOODO : for 16 vp asic should -= 2; */
4948		ctx->bc->callstack[ctx->bc->call_sp].current --;
4949		break;
4950	}
4951}
4952
4953static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
4954{
4955	if (check_max_only) {
4956		int diff;
4957		switch (reason) {
4958		case FC_PUSH_VPM:
4959			diff = 1;
4960			break;
4961		case FC_PUSH_WQM:
4962			diff = 4;
4963			break;
4964		default:
4965			assert(0);
4966			diff = 0;
4967		}
4968		if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
4969		    ctx->bc->callstack[ctx->bc->call_sp].max) {
4970			ctx->bc->callstack[ctx->bc->call_sp].max =
4971				ctx->bc->callstack[ctx->bc->call_sp].current + diff;
4972		}
4973		return;
4974	}
4975	switch (reason) {
4976	case FC_PUSH_VPM:
4977		ctx->bc->callstack[ctx->bc->call_sp].current++;
4978		break;
4979	case FC_PUSH_WQM:
4980	case FC_LOOP:
4981		ctx->bc->callstack[ctx->bc->call_sp].current += 4;
4982		break;
4983	case FC_REP:
4984		ctx->bc->callstack[ctx->bc->call_sp].current++;
4985		break;
4986	}
4987
4988	if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
4989	    ctx->bc->callstack[ctx->bc->call_sp].max) {
4990		ctx->bc->callstack[ctx->bc->call_sp].max =
4991			ctx->bc->callstack[ctx->bc->call_sp].current;
4992	}
4993}
4994
4995static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
4996{
4997	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
4998
4999	sp->mid = (struct r600_bytecode_cf **)realloc((void *)sp->mid,
5000						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
5001	sp->mid[sp->num_mid] = ctx->bc->cf_last;
5002	sp->num_mid++;
5003}
5004
5005static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
5006{
5007	ctx->bc->fc_sp++;
5008	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
5009	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
5010}
5011
5012static void fc_poplevel(struct r600_shader_ctx *ctx)
5013{
5014	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
5015	if (sp->mid) {
5016		free(sp->mid);
5017		sp->mid = NULL;
5018	}
5019	sp->num_mid = 0;
5020	sp->start = NULL;
5021	sp->type = 0;
5022	ctx->bc->fc_sp--;
5023}
5024
#if 0
/*
 * Disabled, unfinished helpers for RETURN and flag-driven loop exits.
 * Several of them are empty stubs; they are kept only as a sketch.
 */

/* Adds a RETURN CF instruction. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
	return 0;
}

/* Adds a JUMP CF instruction with the given pop count; offset is unused. */
static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{
	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

/* Stub: emits nothing. */
static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

/* Stub: emits nothing. */
static void emit_testflag(struct r600_shader_ctx *ctx)
{
}

/* Test the flag, jump over the return sequence, else pop and return. */
static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

/* Test the flag and emit the current instruction's CF opcode as a loop exit. */
static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
5072
5073static int tgsi_if(struct r600_shader_ctx *ctx)
5074{
5075	emit_logic_pred(ctx, CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
5076
5077	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
5078
5079	fc_pushlevel(ctx, FC_IF);
5080
5081	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
5082	return 0;
5083}
5084
5085static int tgsi_else(struct r600_shader_ctx *ctx)
5086{
5087	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_ELSE));
5088	ctx->bc->cf_last->pop_count = 1;
5089
5090	fc_set_mid(ctx, ctx->bc->fc_sp);
5091	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
5092	return 0;
5093}
5094
5095static int tgsi_endif(struct r600_shader_ctx *ctx)
5096{
5097	pops(ctx, 1);
5098	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
5099		R600_ERR("if/endif unbalanced in shader\n");
5100		return -1;
5101	}
5102
5103	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
5104		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5105		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
5106	} else {
5107		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
5108	}
5109	fc_poplevel(ctx);
5110
5111	callstack_decrease_current(ctx, FC_PUSH_VPM);
5112	return 0;
5113}
5114
5115static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
5116{
5117	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL));
5118
5119	fc_pushlevel(ctx, FC_LOOP);
5120
5121	/* check stack depth */
5122	callstack_check_depth(ctx, FC_LOOP, 0);
5123	return 0;
5124}
5125
5126static int tgsi_endloop(struct r600_shader_ctx *ctx)
5127{
5128	int i;
5129
5130	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END));
5131
5132	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
5133		R600_ERR("loop/endloop in shader code are not paired.\n");
5134		return -EINVAL;
5135	}
5136
5137	/* fixup loop pointers - from r600isa
5138	   LOOP END points to CF after LOOP START,
5139	   LOOP START point to CF after LOOP END
5140	   BRK/CONT point to LOOP END CF
5141	*/
5142	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
5143
5144	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5145
5146	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
5147		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
5148	}
5149	/* XXX add LOOPRET support */
5150	fc_poplevel(ctx);
5151	callstack_decrease_current(ctx, FC_LOOP);
5152	return 0;
5153}
5154
5155static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
5156{
5157	unsigned int fscp;
5158
5159	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
5160	{
5161		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
5162			break;
5163	}
5164
5165	if (fscp == 0) {
5166		R600_ERR("Break not inside loop/endloop pair\n");
5167		return -EINVAL;
5168	}
5169
5170	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
5171
5172	fc_set_mid(ctx, fscp);
5173
5174	callstack_check_depth(ctx, FC_PUSH_VPM, 1);
5175	return 0;
5176}
5177
5178static int tgsi_umad(struct r600_shader_ctx *ctx)
5179{
5180	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5181	struct r600_bytecode_alu alu;
5182	int i, j, r;
5183	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5184
5185	/* src0 * src1 */
5186	for (i = 0; i < lasti + 1; i++) {
5187		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5188			continue;
5189
5190		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5191
5192		alu.dst.chan = i;
5193		alu.dst.sel = ctx->temp_reg;
5194		alu.dst.write = 1;
5195
5196		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
5197		for (j = 0; j < 2; j++) {
5198		        r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5199		}
5200
5201		alu.last = 1;
5202		r = r600_bytecode_add_alu(ctx->bc, &alu);
5203		if (r)
5204			return r;
5205	}
5206
5207
5208	for (i = 0; i < lasti + 1; i++) {
5209		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5210			continue;
5211
5212		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5213		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5214
5215		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
5216
5217		alu.src[0].sel = ctx->temp_reg;
5218		alu.src[0].chan = i;
5219
5220		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5221		if (i == lasti) {
5222			alu.last = 1;
5223		}
5224		r = r600_bytecode_add_alu(ctx->bc, &alu);
5225		if (r)
5226			return r;
5227	}
5228	return 0;
5229}
5230
/*
 * TGSI opcode translation table using the plain V_SQ_* hardware opcodes.
 *
 * Each entry is { TGSI opcode, flag, hardware opcode, handler }:
 *   - the first field is the TGSI opcode (a bare number where the opcode
 *     value has no TGSI_OPCODE_* name, marked "gap");
 *   - the second field is 1 only for MAD, the one entry using an OP3
 *     encoding here (presumably an "is three-operand" flag - TODO confirm
 *     against the struct definition);
 *   - the third field is an ALU, CF or texture instruction code; handlers
 *     read it through ctx->inst_info->r600_opcode (see tgsi_loop_brk_cont);
 *   - the fourth field is the function that emits bytecode for the opcode;
 *     tgsi_unsupported marks opcodes this table does not translate.
 *
 * Entries appear in ascending opcode order with explicit placeholders for
 * unassigned values, which suggests the table is indexed directly by TGSI
 * opcode - keep the order and the placeholders intact when editing
 * (NOTE(review): confirm at the lookup site).
 */
5231static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
5232	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5233	{TGSI_OPCODE_MOV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5234	{TGSI_OPCODE_LIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5235
5236	/* XXX:
5237	 * For state trackers other than OpenGL, we'll want to use
5238	 * _RECIP_IEEE instead.
5239	 */
5240	{TGSI_OPCODE_RCP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
5241
5242	{TGSI_OPCODE_RSQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_rsq},
5243	{TGSI_OPCODE_EXP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5244	{TGSI_OPCODE_LOG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5245	{TGSI_OPCODE_MUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5246	{TGSI_OPCODE_ADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5247	{TGSI_OPCODE_DP3,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5248	{TGSI_OPCODE_DP4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5249	{TGSI_OPCODE_DST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5250	{TGSI_OPCODE_MIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5251	{TGSI_OPCODE_MAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5252	{TGSI_OPCODE_SLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5253	{TGSI_OPCODE_SGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5254	{TGSI_OPCODE_MAD,	1, V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5255	{TGSI_OPCODE_SUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5256	{TGSI_OPCODE_LRP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5257	{TGSI_OPCODE_CND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5258	/* gap */
5259	{20,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5260	{TGSI_OPCODE_DP2A,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5261	/* gap */
5262	{22,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5263	{23,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5264	{TGSI_OPCODE_FRC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5265	{TGSI_OPCODE_CLAMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5266	{TGSI_OPCODE_FLR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5267	{TGSI_OPCODE_ROUND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5268	{TGSI_OPCODE_EX2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5269	{TGSI_OPCODE_LG2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5270	{TGSI_OPCODE_POW,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5271	{TGSI_OPCODE_XPD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5272	/* gap */
5273	{32,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5274	{TGSI_OPCODE_ABS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5275	{TGSI_OPCODE_RCC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5276	{TGSI_OPCODE_DPH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5277	{TGSI_OPCODE_COS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5278	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5279	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5280	{TGSI_OPCODE_KILP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5281	{TGSI_OPCODE_PK2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5282	{TGSI_OPCODE_PK2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5283	{TGSI_OPCODE_PK4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5284	{TGSI_OPCODE_PK4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5285	{TGSI_OPCODE_RFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5286	{TGSI_OPCODE_SEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5287	{TGSI_OPCODE_SFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5288	{TGSI_OPCODE_SGT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5289	{TGSI_OPCODE_SIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5290	{TGSI_OPCODE_SLE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5291	{TGSI_OPCODE_SNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5292	{TGSI_OPCODE_STR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5293	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5294	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5295	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5296	{TGSI_OPCODE_UP2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5297	{TGSI_OPCODE_UP2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5298	{TGSI_OPCODE_UP4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5299	{TGSI_OPCODE_UP4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5300	{TGSI_OPCODE_X2D,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5301	{TGSI_OPCODE_ARA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5302	{TGSI_OPCODE_ARR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5303	{TGSI_OPCODE_BRA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5304	{TGSI_OPCODE_CAL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5305	{TGSI_OPCODE_RET,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5306	{TGSI_OPCODE_SSG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5307	{TGSI_OPCODE_CMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5308	{TGSI_OPCODE_SCS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5309	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5310	{TGSI_OPCODE_NRM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5311	{TGSI_OPCODE_DIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5312	{TGSI_OPCODE_DP2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5313	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5314	{TGSI_OPCODE_BRK,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5315	{TGSI_OPCODE_IF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5316	/* gap */
5317	{75,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5318	{76,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5319	{TGSI_OPCODE_ELSE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5320	{TGSI_OPCODE_ENDIF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5321	/* gap */
5322	{79,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5323	{80,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5324	{TGSI_OPCODE_PUSHA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5325	{TGSI_OPCODE_POPA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5326	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5327	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5328	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5329	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5330	{TGSI_OPCODE_SHL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2_trans},
5331	/* gap */
5332	{88,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5333	{TGSI_OPCODE_AND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5334	{TGSI_OPCODE_OR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5335	{TGSI_OPCODE_MOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5336	{TGSI_OPCODE_XOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5337	{TGSI_OPCODE_SAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5338	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5339	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5340	{TGSI_OPCODE_CONT,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5341	{TGSI_OPCODE_EMIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5342	{TGSI_OPCODE_ENDPRIM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5343	{TGSI_OPCODE_BGNLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5344	{TGSI_OPCODE_BGNSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5345	{TGSI_OPCODE_ENDLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5346	{TGSI_OPCODE_ENDSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5347	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5348	/* gap */
5349	{104,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5350	{105,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5351	{106,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5352	{TGSI_OPCODE_NOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5353	/* gap */
5354	{108,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5355	{109,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5356	{110,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5357	{111,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5358	{TGSI_OPCODE_NRM4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5359	{TGSI_OPCODE_CALLNZ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5360	{TGSI_OPCODE_IFC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5361	{TGSI_OPCODE_BREAKC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5362	{TGSI_OPCODE_KIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5363	{TGSI_OPCODE_END,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5364	/* gap */
5365	{118,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5366	{TGSI_OPCODE_F2I,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2_trans},
5367	{TGSI_OPCODE_IDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5368	{TGSI_OPCODE_IMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5369	{TGSI_OPCODE_IMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5370	{TGSI_OPCODE_INEG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5371	{TGSI_OPCODE_ISGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5372	{TGSI_OPCODE_ISHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2_trans},
5373	{TGSI_OPCODE_ISLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5374	{TGSI_OPCODE_F2U,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2_trans},
5375	{TGSI_OPCODE_U2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5376	{TGSI_OPCODE_UADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5377	{TGSI_OPCODE_UDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5378	{TGSI_OPCODE_UMAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5379	{TGSI_OPCODE_UMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5380	{TGSI_OPCODE_UMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5381	{TGSI_OPCODE_UMOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5382	{TGSI_OPCODE_UMUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5383	{TGSI_OPCODE_USEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5384	{TGSI_OPCODE_USGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5385	{TGSI_OPCODE_USHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2_trans},
5386	{TGSI_OPCODE_USLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5387	{TGSI_OPCODE_USNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2_swap},
5388	{TGSI_OPCODE_SWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5389	{TGSI_OPCODE_CASE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5390	{TGSI_OPCODE_DEFAULT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5391	{TGSI_OPCODE_ENDSWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5392	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5393	{TGSI_OPCODE_SAMPLE_I,  0, 0, tgsi_unsupported},
5394	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
5395	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5396	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5397	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5398	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5399	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5400	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5401	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5402	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5403	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5404	{TGSI_OPCODE_UARL,      0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_r600_arl},
5405	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5406	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5407	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5408	{TGSI_OPCODE_LAST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5409};
5410
/*
 * TGSI opcode translation table using the EG_V_SQ_* hardware opcodes
 * (the "eg" prefix presumably stands for Evergreen - TODO confirm at the
 * place where a table is selected per chip class).
 *
 * Each entry is { TGSI opcode, flag, hardware opcode, handler }:
 *   - the first field is the TGSI opcode (a bare number where the opcode
 *     value has no TGSI_OPCODE_* name, marked "gap");
 *   - the second field is 1 only for MAD, the one entry using an OP3
 *     encoding here (presumably an "is three-operand" flag - TODO confirm
 *     against the struct definition);
 *   - the third field is an ALU, CF or texture instruction code; handlers
 *     read it through ctx->inst_info->r600_opcode (see tgsi_loop_brk_cont);
 *   - the fourth field is the function that emits bytecode for the opcode;
 *     tgsi_unsupported marks opcodes this table does not translate.
 *
 * Entries appear in ascending opcode order with explicit placeholders for
 * unassigned values, which suggests the table is indexed directly by TGSI
 * opcode - keep the order and the placeholders intact when editing
 * (NOTE(review): confirm at the lookup site).
 */
5411static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
5412	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5413	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5414	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5415	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, tgsi_trans_srcx_replicate},
5416	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, tgsi_rsq},
5417	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5418	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5419	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5420	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5421	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5422	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5423	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5424	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5425	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5426	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5427	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5428	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5429	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5430	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5431	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5432	/* gap */
5433	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5434	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5435	/* gap */
5436	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5437	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5438	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5439	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5440	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5441	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5442	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5443	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5444	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5445	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5446	/* gap */
5447	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5448	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5449	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5450	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5451	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5452	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5453	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5454	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5455	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5456	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5457	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5458	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5459	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5460	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5461	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5462	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5463	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5464	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5465	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5466	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5467	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5468	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5469	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5470	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5471	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5472	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5473	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5474	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5475	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5476	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5477	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5478	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5479	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5480	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5481	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5482	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5483	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5484	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5485	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5486	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5487	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5488	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5489	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5490	/* gap */
5491	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5492	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5493	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5494	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5495	/* gap */
5496	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5497	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5498	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5499	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5500	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5501	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5502	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5503	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5504	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
5505	/* gap */
5506	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5507	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5508	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5509	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5510	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5511	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5512	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5513	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5514	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5515	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5516	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5517	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5518	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5519	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5520	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5521	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5522	/* gap */
5523	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5524	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5525	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5526	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5527	/* gap */
5528	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5529	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5530	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5531	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5532	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5533	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5534	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5535	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5536	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5537	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5538	/* gap */
5539	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5540	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_f2i},
5541	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5542	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5543	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5544	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5545	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5546	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
5547	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5548	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_f2i},
5549	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5550	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5551	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5552	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5553	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5554	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5555	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5556	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5557	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5558	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5559	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
5560	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5561	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
5562	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5563	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5564	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5565	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5566	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5567	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
5568	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
5569	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5570	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5571	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5572	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5573	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5574	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5575	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5576	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5577	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5578	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
5579	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5580	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5581	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5582	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5583};
5584
5585static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
5586	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5587	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5588	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5589	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, cayman_emit_float_instr},
5590	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, cayman_emit_float_instr},
5591	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5592	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5593	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5594	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5595	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5596	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5597	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5598	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5599	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5600	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5601	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5602	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5603	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5604	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5605	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5606	/* gap */
5607	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5608	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5609	/* gap */
5610	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5611	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5612	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5613	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5614	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5615	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5616	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, cayman_emit_float_instr},
5617	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, cayman_emit_float_instr},
5618	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, cayman_pow},
5619	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5620	/* gap */
5621	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5622	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5623	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5624	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5625	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, cayman_trig},
5626	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5627	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5628	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5629	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5630	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5631	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5632	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5633	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5634	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5635	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5636	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5637	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, cayman_trig},
5638	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5639	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5640	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5641	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5642	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5643	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5644	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5645	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5646	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5647	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5648	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5649	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5650	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5651	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5652	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5653	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5654	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5655	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5656	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5657	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5658	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5659	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5660	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5661	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5662	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5663	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5664	/* gap */
5665	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5666	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5667	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5668	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5669	/* gap */
5670	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5671	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5672	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5673	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5674	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5675	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2},
5676	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5677	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5678	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
5679	/* gap */
5680	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5681	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5682	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5683	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5684	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5685	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5686	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5687	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5688	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5689	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5690	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5691	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5692	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5693	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5694	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5695	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5696	/* gap */
5697	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5698	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5699	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5700	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5701	/* gap */
5702	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5703	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5704	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5705	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5706	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5707	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5708	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5709	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5710	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5711	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5712	/* gap */
5713	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5714	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2},
5715	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5716	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5717	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5718	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5719	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5720	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
5721	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5722	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
5723	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2},
5724	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5725	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5726	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5727	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5728	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5729	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5730	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT, cayman_mul_int_instr},
5731	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5732	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5733	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
5734	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5735	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
5736	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5737	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5738	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5739	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5740	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5741	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
5742	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
5743	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5744	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5745	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5746	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5747	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5748	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5749	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5750	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5751	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5752	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
5753	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5754	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5755	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5756	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5757};
5758