r600_shader.c revision 8f597d57e959830040473b548e0e04cfc63866c2
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600d.h"
28
29#include "pipe/p_shader_tokens.h"
30#include "tgsi/tgsi_info.h"
31#include "tgsi/tgsi_parse.h"
32#include "tgsi/tgsi_scan.h"
33#include "tgsi/tgsi_dump.h"
34#include "util/u_memory.h"
35#include <stdio.h>
36#include <errno.h>
37#include <byteswap.h>
38
/* CAYMAN notes
This explains why many instructions are emitted in per-slot loops on CAYMAN.
41
42-These 8xx t-slot only ops are implemented in all vector slots.
43MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
44These 8xx t-slot only opcodes become vector ops, with all four
45slots expecting the arguments on sources a and b. Result is
46broadcast to all channels.
47MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
48These 8xx t-slot only opcodes become vector ops in the z, y, and
49x slots.
50EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
51RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
52SQRT_IEEE/_64
53SIN/COS
54The w slot may have an independent co-issued operation, or if the
55result is required to be in the w slot, the opcode above may be
56issued in the w slot as well.
57The compiler must issue the source argument to slots z, y, and x
58*/
59
60static int r600_pipe_shader(struct pipe_context *ctx, struct r600_pipe_shader *shader)
61{
62	struct r600_context *rctx = (struct r600_context *)ctx;
63	struct r600_shader *rshader = &shader->shader;
64	uint32_t *ptr;
65	int	i;
66
67	/* copy new shader */
68	if (shader->bo == NULL) {
69		shader->bo = (struct r600_resource*)
70			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, rshader->bc.ndw * 4);
71		if (shader->bo == NULL) {
72			return -ENOMEM;
73		}
74		ptr = (uint32_t*)rctx->ws->buffer_map(shader->bo->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
75		if (R600_BIG_ENDIAN) {
76			for (i = 0; i < rshader->bc.ndw; ++i) {
77				ptr[i] = bswap_32(rshader->bc.bytecode[i]);
78			}
79		} else {
80			memcpy(ptr, rshader->bc.bytecode, rshader->bc.ndw * sizeof(*ptr));
81		}
82		rctx->ws->buffer_unmap(shader->bo->cs_buf);
83	}
84	/* build state */
85	switch (rshader->processor_type) {
86	case TGSI_PROCESSOR_VERTEX:
87		if (rctx->chip_class >= EVERGREEN) {
88			evergreen_pipe_shader_vs(ctx, shader);
89		} else {
90			r600_pipe_shader_vs(ctx, shader);
91		}
92		break;
93	case TGSI_PROCESSOR_FRAGMENT:
94		if (rctx->chip_class >= EVERGREEN) {
95			evergreen_pipe_shader_ps(ctx, shader);
96		} else {
97			r600_pipe_shader_ps(ctx, shader);
98		}
99		break;
100	default:
101		return -EINVAL;
102	}
103	return 0;
104}
105
106static int r600_shader_from_tgsi(struct r600_context * rctx, struct r600_pipe_shader *pipeshader);
107
108int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *shader)
109{
110	static int dump_shaders = -1;
111	struct r600_context *rctx = (struct r600_context *)ctx;
112	struct r600_pipe_shader_selector *sel = shader->selector;
113	int r;
114
115	/* Would like some magic "get_bool_option_once" routine.
116	*/
117	if (dump_shaders == -1)
118		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
119
120	if (dump_shaders) {
121		fprintf(stderr, "--------------------------------------------------------------\n");
122		tgsi_dump(sel->tokens, 0);
123
124		if (sel->so.num_outputs) {
125			unsigned i;
126			fprintf(stderr, "STREAMOUT\n");
127			for (i = 0; i < sel->so.num_outputs; i++) {
128				unsigned mask = ((1 << sel->so.output[i].num_components) - 1) <<
129						sel->so.output[i].start_component;
130				fprintf(stderr, "  %i: MEM_STREAM0_BUF%i OUT[%i].%s%s%s%s\n", i,
131					sel->so.output[i].output_buffer, sel->so.output[i].register_index,
132				        mask & 1 ? "x" : "_",
133				        (mask >> 1) & 1 ? "y" : "_",
134				        (mask >> 2) & 1 ? "z" : "_",
135				        (mask >> 3) & 1 ? "w" : "_");
136			}
137		}
138	}
139	r = r600_shader_from_tgsi(rctx, shader);
140	if (r) {
141		R600_ERR("translation from TGSI failed !\n");
142		return r;
143	}
144	r = r600_bytecode_build(&shader->shader.bc);
145	if (r) {
146		R600_ERR("building bytecode failed !\n");
147		return r;
148	}
149	if (dump_shaders) {
150		r600_bytecode_dump(&shader->shader.bc);
151		fprintf(stderr, "______________________________________________________________\n");
152	}
153	return r600_pipe_shader(ctx, shader);
154}
155
156void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
157{
158	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
159	r600_bytecode_clear(&shader->shader.bc);
160}
161
162/*
163 * tgsi -> r600 shader
164 */
165struct r600_shader_tgsi_instruction;
166
/* One source operand, as filled in by tgsi_src() from a TGSI source register. */
struct r600_shader_src {
	unsigned	sel;		/* register / constant selector */
	unsigned	swizzle[4];	/* swizzle for the X, Y, Z and W channels */
	unsigned	neg;		/* negate modifier */
	unsigned	abs;		/* absolute-value modifier */
	unsigned	rel;		/* relative addressing mode */
	uint32_t	value[4];	/* literal values, copied for immediates */
};
175
176struct r600_shader_ctx {
177	struct tgsi_shader_info			info;
178	struct tgsi_parse_context		parse;
179	const struct tgsi_token			*tokens;
180	unsigned				type;
181	unsigned				file_offset[TGSI_FILE_COUNT];
182	unsigned				temp_reg;
183	struct r600_shader_tgsi_instruction	*inst_info;
184	struct r600_bytecode			*bc;
185	struct r600_shader			*shader;
186	struct r600_shader_src			src[4];
187	uint32_t				*literals;
188	uint32_t				nliterals;
189	uint32_t				max_driver_temp_used;
190	/* needed for evergreen interpolation */
191	boolean                                 input_centroid;
192	boolean                                 input_linear;
193	boolean                                 input_perspective;
194	int					num_interp_gpr;
195	int					face_gpr;
196	int					colors_used;
197	boolean                 clip_vertex_write;
198	unsigned                cv_output;
199	int					fragcoord_input;
200	int					native_integers;
201};
202
struct r600_shader_ctx;

/*
 * One entry of the per-chip TGSI opcode tables; the tables are indexed by
 * TGSI opcode.
 */
struct r600_shader_tgsi_instruction {
	unsigned tgsi_opcode;	/* TGSI opcode of this entry */
	unsigned is_op3;	/* non-zero for three-operand ALU instructions */
	unsigned r600_opcode;	/* hardware opcode to emit */
	/* Translation callback for this opcode. */
	int (*process)(struct r600_shader_ctx *ctx);
};
209
210static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
211static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
212static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
213static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
214static int tgsi_else(struct r600_shader_ctx *ctx);
215static int tgsi_endif(struct r600_shader_ctx *ctx);
216static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
217static int tgsi_endloop(struct r600_shader_ctx *ctx);
218static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
219
220/*
221 * bytestream -> r600 shader
222 *
223 * These functions are used to transform the output of the LLVM backend into
224 * struct r600_bytecode.
225 */
226
227static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
228				unsigned char * bytes,	unsigned num_bytes);
229
230#ifdef HAVE_OPENCL
231int r600_compute_shader_create(struct pipe_context * ctx,
232	LLVMModuleRef mod,  struct r600_bytecode * bytecode)
233{
234	struct r600_context *r600_ctx = (struct r600_context *)ctx;
235	unsigned char * bytes;
236	unsigned byte_count;
237	struct r600_shader_ctx shader_ctx;
238	unsigned dump = 0;
239
240	if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
241		dump = 1;
242	}
243
244	r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
245	shader_ctx.bc = bytecode;
246	r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family);
247	shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
248	r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
249	if (shader_ctx.bc->chip_class == CAYMAN) {
250		cm_bytecode_add_cf_end(shader_ctx.bc);
251	}
252	r600_bytecode_build(shader_ctx.bc);
253	if (dump) {
254		r600_bytecode_dump(shader_ctx.bc);
255	}
256	return 1;
257}
258
259#endif /* HAVE_OPENCL */
260
/*
 * Read a 32-bit value stored least-significant byte first.
 *
 * bytes       base of the byte stream
 * bytes_read  in/out: offset of the first byte to read; advanced by 4
 *
 * Returns the assembled value.
 */
static uint32_t i32_from_byte_stream(unsigned char * bytes,
		unsigned * bytes_read)
{
	unsigned i;
	uint32_t out = 0;
	for (i = 0; i < 4; i++) {
		/* Widen before shifting: the byte would otherwise be promoted
		 * to int, and shifting a value >= 0x80 left by 24 overflows
		 * int, which is undefined. */
		out |= (uint32_t)bytes[(*bytes_read)++] << (8 * i);
	}
	return out;
}
271
272static unsigned r600_src_from_byte_stream(unsigned char * bytes,
273		unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
274{
275	unsigned i;
276	unsigned sel0, sel1;
277	sel0 = bytes[bytes_read++];
278	sel1 = bytes[bytes_read++];
279	alu->src[src_idx].sel = sel0 | (sel1 << 8);
280	alu->src[src_idx].chan = bytes[bytes_read++];
281	alu->src[src_idx].neg = bytes[bytes_read++];
282	alu->src[src_idx].abs = bytes[bytes_read++];
283	alu->src[src_idx].rel = bytes[bytes_read++];
284	alu->src[src_idx].kc_bank = bytes[bytes_read++];
285	for (i = 0; i < 4; i++) {
286		alu->src[src_idx].value |= bytes[bytes_read++] << (i * 8);
287	}
288	return bytes_read;
289}
290
291static unsigned r600_alu_from_byte_stream(struct r600_shader_ctx *ctx,
292				unsigned char * bytes, unsigned bytes_read)
293{
294	unsigned src_idx;
295	unsigned inst0, inst1;
296	unsigned push_modifier;
297	struct r600_bytecode_alu alu;
298	memset(&alu, 0, sizeof(alu));
299	for(src_idx = 0; src_idx < 3; src_idx++) {
300		bytes_read = r600_src_from_byte_stream(bytes, bytes_read,
301								&alu, src_idx);
302	}
303
304	alu.dst.sel = bytes[bytes_read++];
305	alu.dst.chan = bytes[bytes_read++];
306	alu.dst.clamp = bytes[bytes_read++];
307	alu.dst.write = bytes[bytes_read++];
308	alu.dst.rel = bytes[bytes_read++];
309	inst0 = bytes[bytes_read++];
310	inst1 = bytes[bytes_read++];
311	alu.inst = inst0 | (inst1 << 8);
312	alu.last = bytes[bytes_read++];
313	alu.is_op3 = bytes[bytes_read++];
314	push_modifier = bytes[bytes_read++];
315	alu.pred_sel = bytes[bytes_read++];
316	alu.bank_swizzle = bytes[bytes_read++];
317	alu.bank_swizzle_force = bytes[bytes_read++];
318	alu.omod = bytes[bytes_read++];
319	alu.index_mode = bytes[bytes_read++];
320
321
322	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE) ||
323	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE) ||
324	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT) ||
325	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT)) {
326		alu.update_pred = 1;
327		alu.dst.write = 0;
328		alu.src[1].sel = V_SQ_ALU_SRC_0;
329		alu.src[1].chan = 0;
330		alu.last = 1;
331    }
332
333    if (push_modifier) {
334        alu.pred_sel = 0;
335		alu.execute_mask = 1;
336		r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
337	} else
338		r600_bytecode_add_alu(ctx->bc, &alu);
339
340
341	/* XXX: Handle other KILL instructions */
342	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT)) {
343		ctx->shader->uses_kill = 1;
344		/* XXX: This should be enforced in the LLVM backend. */
345		ctx->bc->force_add_cf = 1;
346	}
347	return bytes_read;
348}
349
350static void llvm_if(struct r600_shader_ctx *ctx, struct r600_bytecode_alu * alu,
351	unsigned pred_inst)
352{
353	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
354	fc_pushlevel(ctx, FC_IF);
355	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
356}
357
358static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx,
359			struct r600_bytecode_alu *alu, unsigned compare_opcode)
360{
361	unsigned opcode = TGSI_OPCODE_BRK;
362	if (ctx->bc->chip_class == CAYMAN)
363		ctx->inst_info = &cm_shader_tgsi_instruction[opcode];
364	else if (ctx->bc->chip_class >= EVERGREEN)
365		ctx->inst_info = &eg_shader_tgsi_instruction[opcode];
366	else
367		ctx->inst_info = &r600_shader_tgsi_instruction[opcode];
368	llvm_if(ctx, alu, compare_opcode);
369	tgsi_loop_brk_cont(ctx);
370	tgsi_endif(ctx);
371}
372
373static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
374				unsigned char * bytes, unsigned bytes_read)
375{
376	struct r600_bytecode_alu alu;
377	unsigned inst;
378	memset(&alu, 0, sizeof(alu));
379	bytes_read = r600_src_from_byte_stream(bytes, bytes_read, &alu, 0);
380	inst = bytes[bytes_read++];
381	switch (inst) {
382	case 0: /* FC_IF */
383		llvm_if(ctx, &alu,
384			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
385		break;
386	case 1: /* FC_IF_INT */
387		llvm_if(ctx, &alu,
388			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
389		break;
390	case 2: /* FC_ELSE */
391		tgsi_else(ctx);
392		break;
393	case 3: /* FC_ENDIF */
394		tgsi_endif(ctx);
395		break;
396	case 4: /* FC_BGNLOOP */
397		tgsi_bgnloop(ctx);
398		break;
399	case 5: /* FC_ENDLOOP */
400		tgsi_endloop(ctx);
401		break;
402	case 6: /* FC_BREAK */
403		r600_break_from_byte_stream(ctx, &alu,
404			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
405		break;
406	case 7: /* FC_BREAK_NZ_INT */
407		r600_break_from_byte_stream(ctx, &alu,
408			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
409		break;
410	case 8: /* FC_CONTINUE */
411		{
412			unsigned opcode = TGSI_OPCODE_CONT;
413			if (ctx->bc->chip_class == CAYMAN) {
414				ctx->inst_info =
415					&cm_shader_tgsi_instruction[opcode];
416			} else if (ctx->bc->chip_class >= EVERGREEN) {
417				ctx->inst_info =
418					&eg_shader_tgsi_instruction[opcode];
419			} else {
420				ctx->inst_info =
421					&r600_shader_tgsi_instruction[opcode];
422			}
423			tgsi_loop_brk_cont(ctx);
424		}
425		break;
426	case 9: /* FC_BREAK_Z_INT */
427		r600_break_from_byte_stream(ctx, &alu,
428			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
429		break;
430	case 10: /* FC_BREAK_NZ */
431		r600_break_from_byte_stream(ctx, &alu,
432			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
433		break;
434	}
435
436	return bytes_read;
437}
438
439static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
440				unsigned char * bytes, unsigned bytes_read)
441{
442	struct r600_bytecode_tex tex;
443
444	tex.inst = bytes[bytes_read++];
445	tex.resource_id = bytes[bytes_read++];
446	tex.src_gpr = bytes[bytes_read++];
447	tex.src_rel = bytes[bytes_read++];
448	tex.dst_gpr = bytes[bytes_read++];
449	tex.dst_rel = bytes[bytes_read++];
450	tex.dst_sel_x = bytes[bytes_read++];
451	tex.dst_sel_y = bytes[bytes_read++];
452	tex.dst_sel_z = bytes[bytes_read++];
453	tex.dst_sel_w = bytes[bytes_read++];
454	tex.lod_bias = bytes[bytes_read++];
455	tex.coord_type_x = bytes[bytes_read++];
456	tex.coord_type_y = bytes[bytes_read++];
457	tex.coord_type_z = bytes[bytes_read++];
458	tex.coord_type_w = bytes[bytes_read++];
459	tex.offset_x = bytes[bytes_read++];
460	tex.offset_y = bytes[bytes_read++];
461	tex.offset_z = bytes[bytes_read++];
462	tex.sampler_id = bytes[bytes_read++];
463	tex.src_sel_x = bytes[bytes_read++];
464	tex.src_sel_y = bytes[bytes_read++];
465	tex.src_sel_z = bytes[bytes_read++];
466	tex.src_sel_w = bytes[bytes_read++];
467
468	r600_bytecode_add_tex(ctx->bc, &tex);
469
470	return bytes_read;
471}
472
473static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
474	unsigned char * bytes, unsigned bytes_read)
475{
476	struct r600_bytecode_vtx vtx;
477
478	uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
479        uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
480	uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);
481
482	memset(&vtx, 0, sizeof(vtx));
483
484	/* WORD0 */
485	vtx.inst = G_SQ_VTX_WORD0_VTX_INST(word0);
486	vtx.fetch_type = G_SQ_VTX_WORD0_FETCH_TYPE(word0);
487	vtx.buffer_id = G_SQ_VTX_WORD0_BUFFER_ID(word0);
488	vtx.src_gpr = G_SQ_VTX_WORD0_SRC_GPR(word0);
489	vtx.src_sel_x = G_SQ_VTX_WORD0_SRC_SEL_X(word0);
490	vtx.mega_fetch_count = G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(word0);
491
492	/* WORD1 */
493	vtx.dst_gpr = G_SQ_VTX_WORD1_GPR_DST_GPR(word1);
494	vtx.dst_sel_x = G_SQ_VTX_WORD1_DST_SEL_X(word1);
495	vtx.dst_sel_y = G_SQ_VTX_WORD1_DST_SEL_Y(word1);
496	vtx.dst_sel_z = G_SQ_VTX_WORD1_DST_SEL_Z(word1);
497	vtx.dst_sel_w = G_SQ_VTX_WORD1_DST_SEL_W(word1);
498	vtx.use_const_fields = G_SQ_VTX_WORD1_USE_CONST_FIELDS(word1);
499	vtx.data_format = G_SQ_VTX_WORD1_DATA_FORMAT(word1);
500	vtx.num_format_all = G_SQ_VTX_WORD1_NUM_FORMAT_ALL(word1);
501	vtx.format_comp_all = G_SQ_VTX_WORD1_FORMAT_COMP_ALL(word1);
502	vtx.srf_mode_all = G_SQ_VTX_WORD1_SRF_MODE_ALL(word1);
503
504	/* WORD 2*/
505	vtx.offset = G_SQ_VTX_WORD2_OFFSET(word2);
506	vtx.endian = G_SQ_VTX_WORD2_ENDIAN_SWAP(word2);
507
508	if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
509		fprintf(stderr, "Error adding vtx\n");
510	}
511	/* Use the Texture Cache */
512	ctx->bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX;
513	return bytes_read;
514}
515
516static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
517				unsigned char * bytes,	unsigned num_bytes)
518{
519	unsigned bytes_read = 0;
520	unsigned i, byte;
521	while (bytes_read < num_bytes) {
522		char inst_type = bytes[bytes_read++];
523		switch (inst_type) {
524		case 0:
525			bytes_read = r600_alu_from_byte_stream(ctx, bytes,
526								bytes_read);
527			break;
528		case 1:
529			bytes_read = r600_tex_from_byte_stream(ctx, bytes,
530								bytes_read);
531			break;
532		case 2:
533			bytes_read = r600_fc_from_byte_stream(ctx, bytes,
534								bytes_read);
535			break;
536		case 3:
537			r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
538			for (i = 0; i < 2; i++) {
539				for (byte = 0 ; byte < 4; byte++) {
540					ctx->bc->cf_last->isa[i] |=
541					(bytes[bytes_read++] << (byte * 8));
542				}
543			}
544			break;
545
546		case 4:
547			bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
548								bytes_read);
549			break;
550		default:
551			/* XXX: Error here */
552			break;
553		}
554	}
555}
556
557/* End bytestream -> r600 shader functions*/
558
559static int tgsi_is_supported(struct r600_shader_ctx *ctx)
560{
561	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
562	int j;
563
564	if (i->Instruction.NumDstRegs > 1) {
565		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
566		return -EINVAL;
567	}
568	if (i->Instruction.Predicate) {
569		R600_ERR("predicate unsupported\n");
570		return -EINVAL;
571	}
572#if 0
573	if (i->Instruction.Label) {
574		R600_ERR("label unsupported\n");
575		return -EINVAL;
576	}
577#endif
578	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
579		if (i->Src[j].Register.Dimension) {
580			R600_ERR("unsupported src %d (dimension %d)\n", j,
581				 i->Src[j].Register.Dimension);
582			return -EINVAL;
583		}
584	}
585	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
586		if (i->Dst[j].Register.Dimension) {
587			R600_ERR("unsupported dst (dimension)\n");
588			return -EINVAL;
589		}
590	}
591	return 0;
592}
593
594static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
595{
596	int i, r;
597	struct r600_bytecode_alu alu;
598	int gpr = 0, base_chan = 0;
599	int ij_index = 0;
600
601	if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
602		ij_index = 0;
603		if (ctx->shader->input[input].centroid)
604			ij_index++;
605	} else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
606		ij_index = 0;
607		/* if we have perspective add one */
608		if (ctx->input_perspective)  {
609			ij_index++;
610			/* if we have perspective centroid */
611			if (ctx->input_centroid)
612				ij_index++;
613		}
614		if (ctx->shader->input[input].centroid)
615			ij_index++;
616	}
617
618	/* work out gpr and base_chan from index */
619	gpr = ij_index / 2;
620	base_chan = (2 * (ij_index % 2)) + 1;
621
622	for (i = 0; i < 8; i++) {
623		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
624
625		if (i < 4)
626			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_ZW;
627		else
628			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_XY;
629
630		if ((i > 1) && (i < 6)) {
631			alu.dst.sel = ctx->shader->input[input].gpr;
632			alu.dst.write = 1;
633		}
634
635		alu.dst.chan = i % 4;
636
637		alu.src[0].sel = gpr;
638		alu.src[0].chan = (base_chan - (i % 2));
639
640		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
641
642		alu.bank_swizzle_force = SQ_ALU_VEC_210;
643		if ((i % 4) == 3)
644			alu.last = 1;
645		r = r600_bytecode_add_alu(ctx->bc, &alu);
646		if (r)
647			return r;
648	}
649	return 0;
650}
651
652static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
653{
654	int i, r;
655	struct r600_bytecode_alu alu;
656
657	for (i = 0; i < 4; i++) {
658		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
659
660		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P0;
661
662		alu.dst.sel = ctx->shader->input[input].gpr;
663		alu.dst.write = 1;
664
665		alu.dst.chan = i;
666
667		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
668		alu.src[0].chan = i;
669
670		if (i == 3)
671			alu.last = 1;
672		r = r600_bytecode_add_alu(ctx->bc, &alu);
673		if (r)
674			return r;
675	}
676	return 0;
677}
678
679/*
680 * Special export handling in shaders
681 *
682 * shader export ARRAY_BASE for EXPORT_POS:
683 * 60 is position
684 * 61 is misc vector
685 * 62, 63 are clip distance vectors
686 *
687 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
688 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
689 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
690 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
691 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
692 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
693 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
694 * exclusive from render target index)
695 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
696 *
697 *
698 * shader export ARRAY_BASE for EXPORT_PIXEL:
699 * 0-7 CB targets
700 * 61 computed Z vector
701 *
702 * The use of the values exported in the computed Z vector are controlled
703 * by DB_SHADER_CONTROL:
704 * Z_EXPORT_ENABLE - Z as a float in RED
705 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
706 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
707 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
708 * DB_SOURCE_FORMAT - export control restrictions
709 *
710 */
711
712
713/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
714static int r600_spi_sid(struct r600_shader_io * io)
715{
716	int index, name = io->name;
717
718	/* These params are handled differently, they don't need
719	 * semantic indices, so we'll use 0 for them.
720	 */
721	if (name == TGSI_SEMANTIC_POSITION ||
722		name == TGSI_SEMANTIC_PSIZE ||
723		name == TGSI_SEMANTIC_FACE)
724		index = 0;
725	else {
726		if (name == TGSI_SEMANTIC_GENERIC) {
727			/* For generic params simply use sid from tgsi */
728			index = io->sid;
729		} else {
730			/* For non-generic params - pack name and sid into 8 bits */
731			index = 0x80 | (name<<3) | (io->sid);
732		}
733
734		/* Make sure that all really used indices have nonzero value, so
735		 * we can just compare it to 0 later instead of comparing the name
736		 * with different values to detect special cases. */
737		index++;
738	}
739
740	return index;
741};
742
743/* turn input into interpolate on EG */
744static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
745{
746	int r = 0;
747
748	if (ctx->shader->input[index].spi_sid) {
749		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
750		if (ctx->shader->input[index].interpolate > 0) {
751			r = evergreen_interp_alu(ctx, index);
752		} else {
753			r = evergreen_interp_flat(ctx, index);
754		}
755	}
756	return r;
757}
758
759static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
760{
761	struct r600_bytecode_alu alu;
762	int i, r;
763	int gpr_front = ctx->shader->input[front].gpr;
764	int gpr_back = ctx->shader->input[back].gpr;
765
766	for (i = 0; i < 4; i++) {
767		memset(&alu, 0, sizeof(alu));
768		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
769		alu.is_op3 = 1;
770		alu.dst.write = 1;
771		alu.dst.sel = gpr_front;
772		alu.src[0].sel = ctx->face_gpr;
773		alu.src[1].sel = gpr_front;
774		alu.src[2].sel = gpr_back;
775
776		alu.dst.chan = i;
777		alu.src[1].chan = i;
778		alu.src[2].chan = i;
779		alu.last = (i==3);
780
781		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
782			return r;
783	}
784
785	return 0;
786}
787
788static int tgsi_declaration(struct r600_shader_ctx *ctx)
789{
790	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
791	unsigned i;
792	int r;
793
794	switch (d->Declaration.File) {
795	case TGSI_FILE_INPUT:
796		i = ctx->shader->ninput++;
797		ctx->shader->input[i].name = d->Semantic.Name;
798		ctx->shader->input[i].sid = d->Semantic.Index;
799		ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
800		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
801		ctx->shader->input[i].centroid = d->Interp.Centroid;
802		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
803		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
804			switch (ctx->shader->input[i].name) {
805			case TGSI_SEMANTIC_FACE:
806				ctx->face_gpr = ctx->shader->input[i].gpr;
807				break;
808			case TGSI_SEMANTIC_COLOR:
809				ctx->colors_used++;
810				break;
811			case TGSI_SEMANTIC_POSITION:
812				ctx->fragcoord_input = i;
813				break;
814			}
815			if (ctx->bc->chip_class >= EVERGREEN) {
816				if ((r = evergreen_interp_input(ctx, i)))
817					return r;
818			}
819		}
820		break;
821	case TGSI_FILE_OUTPUT:
822		i = ctx->shader->noutput++;
823		ctx->shader->output[i].name = d->Semantic.Name;
824		ctx->shader->output[i].sid = d->Semantic.Index;
825		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
826		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
827		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
828		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
829		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
830			switch (d->Semantic.Name) {
831			case TGSI_SEMANTIC_CLIPDIST:
832				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
833				break;
834			case TGSI_SEMANTIC_PSIZE:
835				ctx->shader->vs_out_misc_write = 1;
836				ctx->shader->vs_out_point_size = 1;
837				break;
838			case TGSI_SEMANTIC_CLIPVERTEX:
839				ctx->clip_vertex_write = TRUE;
840				ctx->cv_output = i;
841				break;
842			}
843		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
844			switch (d->Semantic.Name) {
845			case TGSI_SEMANTIC_COLOR:
846				ctx->shader->nr_ps_max_color_exports++;
847				break;
848			}
849		}
850		break;
851	case TGSI_FILE_CONSTANT:
852	case TGSI_FILE_TEMPORARY:
853	case TGSI_FILE_SAMPLER:
854	case TGSI_FILE_ADDRESS:
855		break;
856
857	case TGSI_FILE_SYSTEM_VALUE:
858		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
859			if (!ctx->native_integers) {
860				struct r600_bytecode_alu alu;
861				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
862
863				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
864				alu.src[0].sel = 0;
865				alu.src[0].chan = 3;
866
867				alu.dst.sel = 0;
868				alu.dst.chan = 3;
869				alu.dst.write = 1;
870				alu.last = 1;
871
872				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
873					return r;
874			}
875			break;
876		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
877			break;
878	default:
879		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
880		return -EINVAL;
881	}
882	return 0;
883}
884
885static int r600_get_temp(struct r600_shader_ctx *ctx)
886{
887	return ctx->temp_reg + ctx->max_driver_temp_used++;
888}
889
890/*
891 * for evergreen we need to scan the shader to find the number of GPRs we need to
892 * reserve for interpolation.
893 *
894 * we need to know if we are going to emit
895 * any centroid inputs
896 * if perspective and linear are required
897*/
898static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
899{
900	int i;
901	int num_baryc;
902
903	ctx->input_linear = FALSE;
904	ctx->input_perspective = FALSE;
905	ctx->input_centroid = FALSE;
906	ctx->num_interp_gpr = 1;
907
908	/* any centroid inputs */
909	for (i = 0; i < ctx->info.num_inputs; i++) {
910		/* skip position/face */
911		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
912		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
913			continue;
914		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
915			ctx->input_linear = TRUE;
916		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
917			ctx->input_perspective = TRUE;
918		if (ctx->info.input_centroid[i])
919			ctx->input_centroid = TRUE;
920	}
921
922	num_baryc = 0;
923	/* ignoring sample for now */
924	if (ctx->input_perspective)
925		num_baryc++;
926	if (ctx->input_linear)
927		num_baryc++;
928	if (ctx->input_centroid)
929		num_baryc *= 2;
930
931	ctx->num_interp_gpr += (num_baryc + 1) >> 1;
932
933	/* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */
934	return ctx->num_interp_gpr;
935}
936
937static void tgsi_src(struct r600_shader_ctx *ctx,
938		     const struct tgsi_full_src_register *tgsi_src,
939		     struct r600_shader_src *r600_src)
940{
941	memset(r600_src, 0, sizeof(*r600_src));
942	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
943	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
944	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
945	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
946	r600_src->neg = tgsi_src->Register.Negate;
947	r600_src->abs = tgsi_src->Register.Absolute;
948
949	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
950		int index;
951		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
952			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
953			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
954
955			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
956			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
957			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
958				return;
959		}
960		index = tgsi_src->Register.Index;
961		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
962		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
963	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
964		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
965			r600_src->swizzle[0] = 3;
966			r600_src->swizzle[1] = 3;
967			r600_src->swizzle[2] = 3;
968			r600_src->swizzle[3] = 3;
969			r600_src->sel = 0;
970		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
971			r600_src->swizzle[0] = 0;
972			r600_src->swizzle[1] = 0;
973			r600_src->swizzle[2] = 0;
974			r600_src->swizzle[3] = 0;
975			r600_src->sel = 0;
976		}
977	} else {
978		if (tgsi_src->Register.Indirect)
979			r600_src->rel = V_SQ_REL_RELATIVE;
980		r600_src->sel = tgsi_src->Register.Index;
981		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
982	}
983}
984
985static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int offset, unsigned int dst_reg)
986{
987	struct r600_bytecode_vtx vtx;
988	unsigned int ar_reg;
989	int r;
990
991	if (offset) {
992		struct r600_bytecode_alu alu;
993
994		memset(&alu, 0, sizeof(alu));
995
996		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
997		alu.src[0].sel = ctx->bc->ar_reg;
998
999		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1000		alu.src[1].value = offset;
1001
1002		alu.dst.sel = dst_reg;
1003		alu.dst.write = 1;
1004		alu.last = 1;
1005
1006		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1007			return r;
1008
1009		ar_reg = dst_reg;
1010	} else {
1011		ar_reg = ctx->bc->ar_reg;
1012	}
1013
1014	memset(&vtx, 0, sizeof(vtx));
1015	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
1016	vtx.src_gpr = ar_reg;
1017	vtx.mega_fetch_count = 16;
1018	vtx.dst_gpr = dst_reg;
1019	vtx.dst_sel_x = 0;		/* SEL_X */
1020	vtx.dst_sel_y = 1;		/* SEL_Y */
1021	vtx.dst_sel_z = 2;		/* SEL_Z */
1022	vtx.dst_sel_w = 3;		/* SEL_W */
1023	vtx.data_format = FMT_32_32_32_32_FLOAT;
1024	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1025	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1026	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
1027	vtx.endian = r600_endian_swap(32);
1028
1029	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1030		return r;
1031
1032	return 0;
1033}
1034
1035static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1036{
1037	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1038	struct r600_bytecode_alu alu;
1039	int i, j, k, nconst, r;
1040
1041	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1042		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1043			nconst++;
1044		}
1045		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1046	}
1047	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1048		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1049			continue;
1050		}
1051
1052		if (ctx->src[i].rel) {
1053			int treg = r600_get_temp(ctx);
1054			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].sel - 512, treg)))
1055				return r;
1056
1057			ctx->src[i].sel = treg;
1058			ctx->src[i].rel = 0;
1059			j--;
1060		} else if (j > 0) {
1061			int treg = r600_get_temp(ctx);
1062			for (k = 0; k < 4; k++) {
1063				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1064				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1065				alu.src[0].sel = ctx->src[i].sel;
1066				alu.src[0].chan = k;
1067				alu.src[0].rel = ctx->src[i].rel;
1068				alu.dst.sel = treg;
1069				alu.dst.chan = k;
1070				alu.dst.write = 1;
1071				if (k == 3)
1072					alu.last = 1;
1073				r = r600_bytecode_add_alu(ctx->bc, &alu);
1074				if (r)
1075					return r;
1076			}
1077			ctx->src[i].sel = treg;
1078			ctx->src[i].rel =0;
1079			j--;
1080		}
1081	}
1082	return 0;
1083}
1084
1085/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1086static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1087{
1088	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1089	struct r600_bytecode_alu alu;
1090	int i, j, k, nliteral, r;
1091
1092	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1093		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1094			nliteral++;
1095		}
1096	}
1097	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1098		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1099			int treg = r600_get_temp(ctx);
1100			for (k = 0; k < 4; k++) {
1101				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1102				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
1103				alu.src[0].sel = ctx->src[i].sel;
1104				alu.src[0].chan = k;
1105				alu.src[0].value = ctx->src[i].value[k];
1106				alu.dst.sel = treg;
1107				alu.dst.chan = k;
1108				alu.dst.write = 1;
1109				if (k == 3)
1110					alu.last = 1;
1111				r = r600_bytecode_add_alu(ctx->bc, &alu);
1112				if (r)
1113					return r;
1114			}
1115			ctx->src[i].sel = treg;
1116			j--;
1117		}
1118	}
1119	return 0;
1120}
1121
1122static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1123{
1124	int i, r, count = ctx->shader->ninput;
1125
1126	/* additional inputs will be allocated right after the existing inputs,
1127	 * we won't need them after the color selection, so we don't need to
1128	 * reserve these gprs for the rest of the shader code and to adjust
1129	 * output offsets etc. */
1130	int gpr = ctx->file_offset[TGSI_FILE_INPUT] +
1131			ctx->info.file_max[TGSI_FILE_INPUT] + 1;
1132
1133	if (ctx->face_gpr == -1) {
1134		i = ctx->shader->ninput++;
1135		ctx->shader->input[i].name = TGSI_SEMANTIC_FACE;
1136		ctx->shader->input[i].spi_sid = 0;
1137		ctx->shader->input[i].gpr = gpr++;
1138		ctx->face_gpr = ctx->shader->input[i].gpr;
1139	}
1140
1141	for (i = 0; i < count; i++) {
1142		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1143			int ni = ctx->shader->ninput++;
1144			memcpy(&ctx->shader->input[ni],&ctx->shader->input[i], sizeof(struct r600_shader_io));
1145			ctx->shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
1146			ctx->shader->input[ni].spi_sid = r600_spi_sid(&ctx->shader->input[ni]);
1147			ctx->shader->input[ni].gpr = gpr++;
1148
1149			if (ctx->bc->chip_class >= EVERGREEN) {
1150				r = evergreen_interp_input(ctx, ni);
1151				if (r)
1152					return r;
1153			}
1154
1155			r = select_twoside_color(ctx, i, ni);
1156			if (r)
1157				return r;
1158		}
1159	}
1160	return 0;
1161}
1162
1163static int r600_shader_from_tgsi(struct r600_context * rctx, struct r600_pipe_shader *pipeshader)
1164{
1165	struct r600_shader *shader = &pipeshader->shader;
1166	struct tgsi_token *tokens = pipeshader->selector->tokens;
1167	struct pipe_stream_output_info so = pipeshader->selector->so;
1168	struct tgsi_full_immediate *immediate;
1169	struct tgsi_full_property *property;
1170	struct r600_shader_ctx ctx;
1171	struct r600_bytecode_output output[32];
1172	unsigned output_done, noutput;
1173	unsigned opcode;
1174	int i, j, k, r = 0;
1175	int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0;
1176	/* Declarations used by llvm code */
1177	bool use_llvm = false;
1178	unsigned char * inst_bytes = NULL;
1179	unsigned inst_byte_count = 0;
1180
1181#ifdef R600_USE_LLVM
1182	use_llvm = debug_get_bool_option("R600_LLVM", TRUE);
1183#endif
1184	ctx.bc = &shader->bc;
1185	ctx.shader = shader;
1186	ctx.native_integers = true;
1187
1188	r600_bytecode_init(ctx.bc, rctx->chip_class, rctx->family);
1189	ctx.tokens = tokens;
1190	tgsi_scan_shader(tokens, &ctx.info);
1191	tgsi_parse_init(&ctx.parse, tokens);
1192	ctx.type = ctx.parse.FullHeader.Processor.Processor;
1193	shader->processor_type = ctx.type;
1194	ctx.bc->type = shader->processor_type;
1195
1196	ctx.face_gpr = -1;
1197	ctx.fragcoord_input = -1;
1198	ctx.colors_used = 0;
1199	ctx.clip_vertex_write = 0;
1200
1201	shader->nr_ps_color_exports = 0;
1202	shader->nr_ps_max_color_exports = 0;
1203
1204	shader->two_side = (ctx.type == TGSI_PROCESSOR_FRAGMENT) && rctx->two_side;
1205
1206	/* register allocations */
1207	/* Values [0,127] correspond to GPR[0..127].
1208	 * Values [128,159] correspond to constant buffer bank 0
1209	 * Values [160,191] correspond to constant buffer bank 1
1210	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1211	 * Values [256,287] correspond to constant buffer bank 2 (EG)
1212	 * Values [288,319] correspond to constant buffer bank 3 (EG)
1213	 * Other special values are shown in the list below.
1214	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1215	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1216	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1217	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1218	 * 248	SQ_ALU_SRC_0: special constant 0.0.
1219	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
1220	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
1221	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1222	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
1223	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
1224	 * 254	SQ_ALU_SRC_PV: previous vector result.
1225	 * 255	SQ_ALU_SRC_PS: previous scalar result.
1226	 */
1227	for (i = 0; i < TGSI_FILE_COUNT; i++) {
1228		ctx.file_offset[i] = 0;
1229	}
1230	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1231		ctx.file_offset[TGSI_FILE_INPUT] = 1;
1232		if (ctx.bc->chip_class >= EVERGREEN) {
1233			r600_bytecode_add_cfinst(ctx.bc, EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1234		} else {
1235			r600_bytecode_add_cfinst(ctx.bc, V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
1236		}
1237	}
1238	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
1239		ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1240	}
1241
1242	/* LLVM backend setup */
1243#ifdef R600_USE_LLVM
1244	if (use_llvm && ctx.info.indirect_files) {
1245		fprintf(stderr, "Warning: R600 LLVM backend does not support "
1246				"indirect adressing.  Falling back to TGSI "
1247				"backend.\n");
1248		use_llvm = 0;
1249	}
1250	if (use_llvm) {
1251		struct radeon_llvm_context radeon_llvm_ctx;
1252		LLVMModuleRef mod;
1253		unsigned dump = 0;
1254		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
1255		radeon_llvm_ctx.reserved_reg_count = ctx.file_offset[TGSI_FILE_INPUT];
1256		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
1257		if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
1258			dump = 1;
1259		}
1260		if (r600_llvm_compile(mod, &inst_bytes, &inst_byte_count,
1261							rctx->family, dump)) {
1262			FREE(inst_bytes);
1263			radeon_llvm_dispose(&radeon_llvm_ctx);
1264			use_llvm = 0;
1265			fprintf(stderr, "R600 LLVM backend failed to compile "
1266				"shader.  Falling back to TGSI\n");
1267		} else {
1268			ctx.file_offset[TGSI_FILE_OUTPUT] =
1269					ctx.file_offset[TGSI_FILE_INPUT];
1270		}
1271		radeon_llvm_dispose(&radeon_llvm_ctx);
1272	}
1273#endif
1274	/* End of LLVM backend setup */
1275
1276	if (!use_llvm) {
1277		ctx.file_offset[TGSI_FILE_OUTPUT] =
1278			ctx.file_offset[TGSI_FILE_INPUT] +
1279			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1280	}
1281	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
1282						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
1283
1284	/* Outside the GPR range. This will be translated to one of the
1285	 * kcache banks later. */
1286	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
1287
1288	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
1289	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
1290			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
1291	ctx.temp_reg = ctx.bc->ar_reg + 1;
1292
1293	ctx.nliterals = 0;
1294	ctx.literals = NULL;
1295	shader->fs_write_all = FALSE;
1296	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1297		tgsi_parse_token(&ctx.parse);
1298		switch (ctx.parse.FullToken.Token.Type) {
1299		case TGSI_TOKEN_TYPE_IMMEDIATE:
1300			immediate = &ctx.parse.FullToken.FullImmediate;
1301			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
1302			if(ctx.literals == NULL) {
1303				r = -ENOMEM;
1304				goto out_err;
1305			}
1306			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
1307			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
1308			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
1309			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
1310			ctx.nliterals++;
1311			break;
1312		case TGSI_TOKEN_TYPE_DECLARATION:
1313			r = tgsi_declaration(&ctx);
1314			if (r)
1315				goto out_err;
1316			break;
1317		case TGSI_TOKEN_TYPE_INSTRUCTION:
1318			break;
1319		case TGSI_TOKEN_TYPE_PROPERTY:
1320			property = &ctx.parse.FullToken.FullProperty;
1321			switch (property->Property.PropertyName) {
1322			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
1323				if (property->u[0].Data == 1)
1324					shader->fs_write_all = TRUE;
1325				break;
1326			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
1327				if (property->u[0].Data == 1)
1328					shader->vs_prohibit_ucps = TRUE;
1329				break;
1330			}
1331			break;
1332		default:
1333			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
1334			r = -EINVAL;
1335			goto out_err;
1336		}
1337	}
1338
1339	if (shader->fs_write_all && rctx->chip_class >= EVERGREEN)
1340		shader->nr_ps_max_color_exports = 8;
1341
1342	if (ctx.fragcoord_input >= 0) {
1343		if (ctx.bc->chip_class == CAYMAN) {
1344			for (j = 0 ; j < 4; j++) {
1345				struct r600_bytecode_alu alu;
1346				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1347				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1348				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1349				alu.src[0].chan = 3;
1350
1351				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1352				alu.dst.chan = j;
1353				alu.dst.write = (j == 3);
1354				alu.last = 1;
1355				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1356					return r;
1357			}
1358		} else {
1359			struct r600_bytecode_alu alu;
1360			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1361			alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
1362			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1363			alu.src[0].chan = 3;
1364
1365			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1366			alu.dst.chan = 3;
1367			alu.dst.write = 1;
1368			alu.last = 1;
1369			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1370				return r;
1371		}
1372	}
1373
1374	if (shader->two_side && ctx.colors_used) {
1375		if ((r = process_twoside_color_inputs(&ctx)))
1376			return r;
1377	}
1378
1379	tgsi_parse_init(&ctx.parse, tokens);
1380	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1381		tgsi_parse_token(&ctx.parse);
1382		switch (ctx.parse.FullToken.Token.Type) {
1383		case TGSI_TOKEN_TYPE_INSTRUCTION:
1384			if (use_llvm) {
1385				continue;
1386			}
1387			r = tgsi_is_supported(&ctx);
1388			if (r)
1389				goto out_err;
1390			ctx.max_driver_temp_used = 0;
1391			/* reserve first tmp for everyone */
1392			r600_get_temp(&ctx);
1393
1394			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
1395			if ((r = tgsi_split_constant(&ctx)))
1396				goto out_err;
1397			if ((r = tgsi_split_literal_constant(&ctx)))
1398				goto out_err;
1399			if (ctx.bc->chip_class == CAYMAN)
1400				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
1401			else if (ctx.bc->chip_class >= EVERGREEN)
1402				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
1403			else
1404				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
1405			r = ctx.inst_info->process(&ctx);
1406			if (r)
1407				goto out_err;
1408			break;
1409		default:
1410			break;
1411		}
1412	}
1413
1414	/* Get instructions if we are using the LLVM backend. */
1415	if (use_llvm) {
1416		r600_bytecode_from_byte_stream(&ctx, inst_bytes, inst_byte_count);
1417		FREE(inst_bytes);
1418	}
1419
1420	noutput = shader->noutput;
1421
1422	if (ctx.clip_vertex_write) {
1423		/* need to convert a clipvertex write into clipdistance writes and not export
1424		   the clip vertex anymore */
1425
1426		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
1427		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1428		shader->output[noutput].gpr = ctx.temp_reg;
1429		noutput++;
1430		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1431		shader->output[noutput].gpr = ctx.temp_reg+1;
1432		noutput++;
1433
1434		/* reset spi_sid for clipvertex output to avoid confusing spi */
1435		shader->output[ctx.cv_output].spi_sid = 0;
1436
1437		shader->clip_dist_write = 0xFF;
1438
1439		for (i = 0; i < 8; i++) {
1440			int oreg = i >> 2;
1441			int ochan = i & 3;
1442
1443			for (j = 0; j < 4; j++) {
1444				struct r600_bytecode_alu alu;
1445				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1446				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4);
1447				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
1448				alu.src[0].chan = j;
1449
1450				alu.src[1].sel = 512 + i;
1451				alu.src[1].kc_bank = 1;
1452				alu.src[1].chan = j;
1453
1454				alu.dst.sel = ctx.temp_reg + oreg;
1455				alu.dst.chan = j;
1456				alu.dst.write = (j == ochan);
1457				if (j == 3)
1458					alu.last = 1;
1459				r = r600_bytecode_add_alu(ctx.bc, &alu);
1460				if (r)
1461					return r;
1462			}
1463		}
1464	}
1465
1466	/* Add stream outputs. */
1467	if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
1468		for (i = 0; i < so.num_outputs; i++) {
1469			struct r600_bytecode_output output;
1470
1471			if (so.output[i].output_buffer >= 4) {
1472				R600_ERR("exceeded the max number of stream output buffers, got: %d\n",
1473					 so.output[i].output_buffer);
1474				r = -EINVAL;
1475				goto out_err;
1476			}
1477			if (so.output[i].dst_offset < so.output[i].start_component) {
1478			   R600_ERR("stream_output - dst_offset cannot be less than start_component\n");
1479			   r = -EINVAL;
1480			   goto out_err;
1481			}
1482
1483			memset(&output, 0, sizeof(struct r600_bytecode_output));
1484			output.gpr = shader->output[so.output[i].register_index].gpr;
1485			output.elem_size = 0;
1486			output.array_base = so.output[i].dst_offset - so.output[i].start_component;
1487			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1488			output.burst_count = 1;
1489			output.barrier = 1;
1490			/* array_size is an upper limit for the burst_count
1491			 * with MEM_STREAM instructions */
1492			output.array_size = 0xFFF;
1493			output.comp_mask = ((1 << so.output[i].num_components) - 1) << so.output[i].start_component;
1494			if (ctx.bc->chip_class >= EVERGREEN) {
1495				switch (so.output[i].output_buffer) {
1496				case 0:
1497					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0;
1498					break;
1499				case 1:
1500					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1;
1501					break;
1502				case 2:
1503					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2;
1504					break;
1505				case 3:
1506					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3;
1507					break;
1508				}
1509			} else {
1510				switch (so.output[i].output_buffer) {
1511				case 0:
1512					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0;
1513					break;
1514				case 1:
1515					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1;
1516					break;
1517				case 2:
1518					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2;
1519					break;
1520				case 3:
1521					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3;
1522					break;
1523				}
1524			}
1525			r = r600_bytecode_add_output(ctx.bc, &output);
1526			if (r)
1527				goto out_err;
1528		}
1529	}
1530
1531	/* export output */
1532	for (i = 0, j = 0; i < noutput; i++, j++) {
1533		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1534		output[j].gpr = shader->output[i].gpr;
1535		output[j].elem_size = 3;
1536		output[j].swizzle_x = 0;
1537		output[j].swizzle_y = 1;
1538		output[j].swizzle_z = 2;
1539		output[j].swizzle_w = 3;
1540		output[j].burst_count = 1;
1541		output[j].barrier = 1;
1542		output[j].type = -1;
1543		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1544		switch (ctx.type) {
1545		case TGSI_PROCESSOR_VERTEX:
1546			switch (shader->output[i].name) {
1547			case TGSI_SEMANTIC_POSITION:
1548				output[j].array_base = next_pos_base++;
1549				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1550				break;
1551
1552			case TGSI_SEMANTIC_PSIZE:
1553				output[j].array_base = next_pos_base++;
1554				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1555				break;
1556			case TGSI_SEMANTIC_CLIPVERTEX:
1557				j--;
1558				break;
1559			case TGSI_SEMANTIC_CLIPDIST:
1560				output[j].array_base = next_pos_base++;
1561				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1562				/* spi_sid is 0 for clipdistance outputs that were generated
1563				 * for clipvertex - we don't need to pass them to PS */
1564				if (shader->output[i].spi_sid) {
1565					j++;
1566					/* duplicate it as PARAM to pass to the pixel shader */
1567					memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
1568					output[j].array_base = next_param_base++;
1569					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1570				}
1571				break;
1572			case TGSI_SEMANTIC_FOG:
1573				output[j].swizzle_y = 4; /* 0 */
1574				output[j].swizzle_z = 4; /* 0 */
1575				output[j].swizzle_w = 5; /* 1 */
1576				break;
1577			}
1578			break;
1579		case TGSI_PROCESSOR_FRAGMENT:
1580			if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
1581				/* never export more colors than the number of CBs */
1582				if (next_pixel_base && next_pixel_base >= (rctx->nr_cbufs + rctx->dual_src_blend * 1)) {
1583					/* skip export */
1584					j--;
1585					continue;
1586				}
1587				output[j].swizzle_w = rctx->alpha_to_one && rctx->multisample_enable && !rctx->cb0_is_integer ? 5 : 3;
1588				output[j].array_base = next_pixel_base++;
1589				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1590				shader->nr_ps_color_exports++;
1591				if (shader->fs_write_all && (rctx->chip_class >= EVERGREEN)) {
1592					for (k = 1; k < rctx->nr_cbufs; k++) {
1593						j++;
1594						memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1595						output[j].gpr = shader->output[i].gpr;
1596						output[j].elem_size = 3;
1597						output[j].swizzle_x = 0;
1598						output[j].swizzle_y = 1;
1599						output[j].swizzle_z = 2;
1600						output[j].swizzle_w = rctx->alpha_to_one && rctx->multisample_enable && !rctx->cb0_is_integer ? 5 : 3;
1601						output[j].burst_count = 1;
1602						output[j].barrier = 1;
1603						output[j].array_base = next_pixel_base++;
1604						output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1605						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1606						shader->nr_ps_color_exports++;
1607					}
1608				}
1609			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
1610				output[j].array_base = 61;
1611				output[j].swizzle_x = 2;
1612				output[j].swizzle_y = 7;
1613				output[j].swizzle_z = output[j].swizzle_w = 7;
1614				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1615			} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
1616				output[j].array_base = 61;
1617				output[j].swizzle_x = 7;
1618				output[j].swizzle_y = 1;
1619				output[j].swizzle_z = output[j].swizzle_w = 7;
1620				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1621			} else {
1622				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
1623				r = -EINVAL;
1624				goto out_err;
1625			}
1626			break;
1627		default:
1628			R600_ERR("unsupported processor type %d\n", ctx.type);
1629			r = -EINVAL;
1630			goto out_err;
1631		}
1632
1633		if (output[j].type==-1) {
1634			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1635			output[j].array_base = next_param_base++;
1636		}
1637	}
1638
1639	/* add fake param output for vertex shader if no param is exported */
1640	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
1641			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1642			output[j].gpr = 0;
1643			output[j].elem_size = 3;
1644			output[j].swizzle_x = 7;
1645			output[j].swizzle_y = 7;
1646			output[j].swizzle_z = 7;
1647			output[j].swizzle_w = 7;
1648			output[j].burst_count = 1;
1649			output[j].barrier = 1;
1650			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1651			output[j].array_base = 0;
1652			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1653			j++;
1654	}
1655
1656	/* add fake pixel export */
1657	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && next_pixel_base == 0) {
1658		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1659		output[j].gpr = 0;
1660		output[j].elem_size = 3;
1661		output[j].swizzle_x = 7;
1662		output[j].swizzle_y = 7;
1663		output[j].swizzle_z = 7;
1664		output[j].swizzle_w = 7;
1665		output[j].burst_count = 1;
1666		output[j].barrier = 1;
1667		output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
1668		output[j].array_base = 0;
1669		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
1670		j++;
1671	}
1672
1673	noutput = j;
1674
1675	/* set export done on last export of each type */
1676	for (i = noutput - 1, output_done = 0; i >= 0; i--) {
1677		if (ctx.bc->chip_class < CAYMAN) {
1678			if (i == (noutput - 1)) {
1679				output[i].end_of_program = 1;
1680			}
1681		}
1682		if (!(output_done & (1 << output[i].type))) {
1683			output_done |= (1 << output[i].type);
1684			output[i].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE);
1685		}
1686	}
1687	/* add output to bytecode */
1688	for (i = 0; i < noutput; i++) {
1689		r = r600_bytecode_add_output(ctx.bc, &output[i]);
1690		if (r)
1691			goto out_err;
1692	}
1693	/* add program end */
1694	if (ctx.bc->chip_class == CAYMAN)
1695		cm_bytecode_add_cf_end(ctx.bc);
1696
1697	/* check GPR limit - we have 124 = 128 - 4
1698	 * (4 are reserved as alu clause temporary registers) */
1699	if (ctx.bc->ngpr > 124) {
1700		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
1701		r = -ENOMEM;
1702		goto out_err;
1703	}
1704
1705	free(ctx.literals);
1706	tgsi_parse_free(&ctx.parse);
1707	return 0;
1708out_err:
1709	free(ctx.literals);
1710	tgsi_parse_free(&ctx.parse);
1711	return r;
1712}
1713
1714static int tgsi_unsupported(struct r600_shader_ctx *ctx)
1715{
1716	R600_ERR("%s tgsi opcode unsupported\n",
1717		 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
1718	return -EINVAL;
1719}
1720
/*
 * Handler that emits nothing and always reports success (0).
 */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	(void)ctx;	/* nothing to translate */
	return 0;
}
1725
1726static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
1727			const struct r600_shader_src *shader_src,
1728			unsigned chan)
1729{
1730	bc_src->sel = shader_src->sel;
1731	bc_src->chan = shader_src->swizzle[chan];
1732	bc_src->neg = shader_src->neg;
1733	bc_src->abs = shader_src->abs;
1734	bc_src->rel = shader_src->rel;
1735	bc_src->value = shader_src->value[bc_src->chan];
1736}
1737
1738static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
1739{
1740	bc_src->abs = 1;
1741	bc_src->neg = 0;
1742}
1743
1744static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
1745{
1746	bc_src->neg = !bc_src->neg;
1747}
1748
1749static void tgsi_dst(struct r600_shader_ctx *ctx,
1750		     const struct tgsi_full_dst_register *tgsi_dst,
1751		     unsigned swizzle,
1752		     struct r600_bytecode_alu_dst *r600_dst)
1753{
1754	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1755
1756	r600_dst->sel = tgsi_dst->Register.Index;
1757	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
1758	r600_dst->chan = swizzle;
1759	r600_dst->write = 1;
1760	if (tgsi_dst->Register.Indirect)
1761		r600_dst->rel = V_SQ_REL_RELATIVE;
1762	if (inst->Instruction.Saturate) {
1763		r600_dst->clamp = 1;
1764	}
1765}
1766
/*
 * Return the index (0..3) of the highest channel enabled in the low four
 * bits of writemask, or 0 when none of them is set.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* scan from W down to Y; X (or an empty mask) is the fallback */
	for (chan = 3; chan > 0; chan--) {
		if ((writemask >> chan) & 1u)
			return chan;
	}
	return 0;
}
1778
1779static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
1780{
1781	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1782	struct r600_bytecode_alu alu;
1783	int i, j, r;
1784	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1785
1786	for (i = 0; i < lasti + 1; i++) {
1787		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1788			continue;
1789
1790		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1791		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1792
1793		alu.inst = ctx->inst_info->r600_opcode;
1794		if (!swap) {
1795			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1796				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
1797			}
1798		} else {
1799			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
1800			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1801		}
1802		/* handle some special cases */
1803		switch (ctx->inst_info->tgsi_opcode) {
1804		case TGSI_OPCODE_SUB:
1805			r600_bytecode_src_toggle_neg(&alu.src[1]);
1806			break;
1807		case TGSI_OPCODE_ABS:
1808			r600_bytecode_src_set_abs(&alu.src[0]);
1809			break;
1810		default:
1811			break;
1812		}
1813		if (i == lasti || trans_only) {
1814			alu.last = 1;
1815		}
1816		r = r600_bytecode_add_alu(ctx->bc, &alu);
1817		if (r)
1818			return r;
1819	}
1820	return 0;
1821}
1822
/*
 * Per-channel translation with sources in TGSI order and only the final
 * channel closing the instruction group.
 */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1827
/*
 * Per-channel translation with the first two sources exchanged.
 */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	const int swap = 1;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1832
/*
 * Per-channel translation where every emitted instruction closes its own
 * instruction group (trans_only set).
 */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 1;

	return tgsi_op2_s(ctx, swap, trans_only);
}
1837
1838static int tgsi_ineg(struct r600_shader_ctx *ctx)
1839{
1840	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1841	struct r600_bytecode_alu alu;
1842	int i, r;
1843	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
1844
1845	for (i = 0; i < lasti + 1; i++) {
1846
1847		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
1848			continue;
1849		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1850		alu.inst = ctx->inst_info->r600_opcode;
1851
1852		alu.src[0].sel = V_SQ_ALU_SRC_0;
1853
1854		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
1855
1856		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1857
1858		if (i == lasti) {
1859			alu.last = 1;
1860		}
1861		r = r600_bytecode_add_alu(ctx->bc, &alu);
1862		if (r)
1863			return r;
1864	}
1865	return 0;
1866
1867}
1868
1869static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
1870{
1871	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1872	int i, j, r;
1873	struct r600_bytecode_alu alu;
1874	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1875
1876	for (i = 0 ; i < last_slot; i++) {
1877		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1878		alu.inst = ctx->inst_info->r600_opcode;
1879		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1880			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
1881		}
1882		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1883		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
1884
1885		if (i == last_slot - 1)
1886			alu.last = 1;
1887		r = r600_bytecode_add_alu(ctx->bc, &alu);
1888		if (r)
1889			return r;
1890	}
1891	return 0;
1892}
1893
1894static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
1895{
1896	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1897	int i, j, k, r;
1898	struct r600_bytecode_alu alu;
1899	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
1900	for (k = 0; k < last_slot; k++) {
1901		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
1902			continue;
1903
1904		for (i = 0 ; i < 4; i++) {
1905			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1906			alu.inst = ctx->inst_info->r600_opcode;
1907			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
1908				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
1909			}
1910			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
1911			alu.dst.write = (i == k);
1912			if (i == 3)
1913				alu.last = 1;
1914			r = r600_bytecode_add_alu(ctx->bc, &alu);
1915			if (r)
1916				return r;
1917		}
1918	}
1919	return 0;
1920}
1921
1922/*
1923 * r600 - trunc to -PI..PI range
1924 * r700 - normalize by dividing by 2PI
1925 * see fdo bug 27901
1926 */
1927static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
1928{
1929	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
1930	static float double_pi = 3.1415926535 * 2;
1931	static float neg_pi = -3.1415926535;
1932
1933	int r;
1934	struct r600_bytecode_alu alu;
1935
1936	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1937	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
1938	alu.is_op3 = 1;
1939
1940	alu.dst.chan = 0;
1941	alu.dst.sel = ctx->temp_reg;
1942	alu.dst.write = 1;
1943
1944	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
1945
1946	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1947	alu.src[1].chan = 0;
1948	alu.src[1].value = *(uint32_t *)&half_inv_pi;
1949	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
1950	alu.src[2].chan = 0;
1951	alu.last = 1;
1952	r = r600_bytecode_add_alu(ctx->bc, &alu);
1953	if (r)
1954		return r;
1955
1956	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1957	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
1958
1959	alu.dst.chan = 0;
1960	alu.dst.sel = ctx->temp_reg;
1961	alu.dst.write = 1;
1962
1963	alu.src[0].sel = ctx->temp_reg;
1964	alu.src[0].chan = 0;
1965	alu.last = 1;
1966	r = r600_bytecode_add_alu(ctx->bc, &alu);
1967	if (r)
1968		return r;
1969
1970	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1971	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
1972	alu.is_op3 = 1;
1973
1974	alu.dst.chan = 0;
1975	alu.dst.sel = ctx->temp_reg;
1976	alu.dst.write = 1;
1977
1978	alu.src[0].sel = ctx->temp_reg;
1979	alu.src[0].chan = 0;
1980
1981	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1982	alu.src[1].chan = 0;
1983	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
1984	alu.src[2].chan = 0;
1985
1986	if (ctx->bc->chip_class == R600) {
1987		alu.src[1].value = *(uint32_t *)&double_pi;
1988		alu.src[2].value = *(uint32_t *)&neg_pi;
1989	} else {
1990		alu.src[1].sel = V_SQ_ALU_SRC_1;
1991		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
1992		alu.src[2].neg = 1;
1993	}
1994
1995	alu.last = 1;
1996	r = r600_bytecode_add_alu(ctx->bc, &alu);
1997	if (r)
1998		return r;
1999	return 0;
2000}
2001
2002static int cayman_trig(struct r600_shader_ctx *ctx)
2003{
2004	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2005	struct r600_bytecode_alu alu;
2006	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2007	int i, r;
2008
2009	r = tgsi_setup_trig(ctx);
2010	if (r)
2011		return r;
2012
2013
2014	for (i = 0; i < last_slot; i++) {
2015		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2016		alu.inst = ctx->inst_info->r600_opcode;
2017		alu.dst.chan = i;
2018
2019		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2020		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2021
2022		alu.src[0].sel = ctx->temp_reg;
2023		alu.src[0].chan = 0;
2024		if (i == last_slot - 1)
2025			alu.last = 1;
2026		r = r600_bytecode_add_alu(ctx->bc, &alu);
2027		if (r)
2028			return r;
2029	}
2030	return 0;
2031}
2032
2033static int tgsi_trig(struct r600_shader_ctx *ctx)
2034{
2035	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2036	struct r600_bytecode_alu alu;
2037	int i, r;
2038	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2039
2040	r = tgsi_setup_trig(ctx);
2041	if (r)
2042		return r;
2043
2044	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2045	alu.inst = ctx->inst_info->r600_opcode;
2046	alu.dst.chan = 0;
2047	alu.dst.sel = ctx->temp_reg;
2048	alu.dst.write = 1;
2049
2050	alu.src[0].sel = ctx->temp_reg;
2051	alu.src[0].chan = 0;
2052	alu.last = 1;
2053	r = r600_bytecode_add_alu(ctx->bc, &alu);
2054	if (r)
2055		return r;
2056
2057	/* replicate result */
2058	for (i = 0; i < lasti + 1; i++) {
2059		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2060			continue;
2061
2062		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2063		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2064
2065		alu.src[0].sel = ctx->temp_reg;
2066		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2067		if (i == lasti)
2068			alu.last = 1;
2069		r = r600_bytecode_add_alu(ctx->bc, &alu);
2070		if (r)
2071			return r;
2072	}
2073	return 0;
2074}
2075
2076static int tgsi_scs(struct r600_shader_ctx *ctx)
2077{
2078	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2079	struct r600_bytecode_alu alu;
2080	int i, r;
2081
2082	/* We'll only need the trig stuff if we are going to write to the
2083	 * X or Y components of the destination vector.
2084	 */
2085	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
2086		r = tgsi_setup_trig(ctx);
2087		if (r)
2088			return r;
2089	}
2090
2091	/* dst.x = COS */
2092	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2093		if (ctx->bc->chip_class == CAYMAN) {
2094			for (i = 0 ; i < 3; i++) {
2095				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2096				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2097				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2098
2099				if (i == 0)
2100					alu.dst.write = 1;
2101				else
2102					alu.dst.write = 0;
2103				alu.src[0].sel = ctx->temp_reg;
2104				alu.src[0].chan = 0;
2105				if (i == 2)
2106					alu.last = 1;
2107				r = r600_bytecode_add_alu(ctx->bc, &alu);
2108				if (r)
2109					return r;
2110			}
2111		} else {
2112			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2113			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
2114			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2115
2116			alu.src[0].sel = ctx->temp_reg;
2117			alu.src[0].chan = 0;
2118			alu.last = 1;
2119			r = r600_bytecode_add_alu(ctx->bc, &alu);
2120			if (r)
2121				return r;
2122		}
2123	}
2124
2125	/* dst.y = SIN */
2126	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2127		if (ctx->bc->chip_class == CAYMAN) {
2128			for (i = 0 ; i < 3; i++) {
2129				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2130				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2131				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2132				if (i == 1)
2133					alu.dst.write = 1;
2134				else
2135					alu.dst.write = 0;
2136				alu.src[0].sel = ctx->temp_reg;
2137				alu.src[0].chan = 0;
2138				if (i == 2)
2139					alu.last = 1;
2140				r = r600_bytecode_add_alu(ctx->bc, &alu);
2141				if (r)
2142					return r;
2143			}
2144		} else {
2145			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2146			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
2147			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2148
2149			alu.src[0].sel = ctx->temp_reg;
2150			alu.src[0].chan = 0;
2151			alu.last = 1;
2152			r = r600_bytecode_add_alu(ctx->bc, &alu);
2153			if (r)
2154				return r;
2155		}
2156	}
2157
2158	/* dst.z = 0.0; */
2159	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2160		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2161
2162		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2163
2164		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2165
2166		alu.src[0].sel = V_SQ_ALU_SRC_0;
2167		alu.src[0].chan = 0;
2168
2169		alu.last = 1;
2170
2171		r = r600_bytecode_add_alu(ctx->bc, &alu);
2172		if (r)
2173			return r;
2174	}
2175
2176	/* dst.w = 1.0; */
2177	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2178		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2179
2180		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2181
2182		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2183
2184		alu.src[0].sel = V_SQ_ALU_SRC_1;
2185		alu.src[0].chan = 0;
2186
2187		alu.last = 1;
2188
2189		r = r600_bytecode_add_alu(ctx->bc, &alu);
2190		if (r)
2191			return r;
2192	}
2193
2194	return 0;
2195}
2196
2197static int tgsi_kill(struct r600_shader_ctx *ctx)
2198{
2199	struct r600_bytecode_alu alu;
2200	int i, r;
2201
2202	for (i = 0; i < 4; i++) {
2203		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2204		alu.inst = ctx->inst_info->r600_opcode;
2205
2206		alu.dst.chan = i;
2207
2208		alu.src[0].sel = V_SQ_ALU_SRC_0;
2209
2210		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) {
2211			alu.src[1].sel = V_SQ_ALU_SRC_1;
2212			alu.src[1].neg = 1;
2213		} else {
2214			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2215		}
2216		if (i == 3) {
2217			alu.last = 1;
2218		}
2219		r = r600_bytecode_add_alu(ctx->bc, &alu);
2220		if (r)
2221			return r;
2222	}
2223
2224	/* kill must be last in ALU */
2225	ctx->bc->force_add_cf = 1;
2226	ctx->shader->uses_kill = TRUE;
2227	return 0;
2228}
2229
2230static int tgsi_lit(struct r600_shader_ctx *ctx)
2231{
2232	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2233	struct r600_bytecode_alu alu;
2234	int r;
2235
2236	/* tmp.x = max(src.y, 0.0) */
2237	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2238	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2239	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
2240	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2241	alu.src[1].chan = 1;
2242
2243	alu.dst.sel = ctx->temp_reg;
2244	alu.dst.chan = 0;
2245	alu.dst.write = 1;
2246
2247	alu.last = 1;
2248	r = r600_bytecode_add_alu(ctx->bc, &alu);
2249	if (r)
2250		return r;
2251
2252	if (inst->Dst[0].Register.WriteMask & (1 << 2))
2253	{
2254		int chan;
2255		int sel;
2256		int i;
2257
2258		if (ctx->bc->chip_class == CAYMAN) {
2259			for (i = 0; i < 3; i++) {
2260				/* tmp.z = log(tmp.x) */
2261				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2262				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2263				alu.src[0].sel = ctx->temp_reg;
2264				alu.src[0].chan = 0;
2265				alu.dst.sel = ctx->temp_reg;
2266				alu.dst.chan = i;
2267				if (i == 2) {
2268					alu.dst.write = 1;
2269					alu.last = 1;
2270				} else
2271					alu.dst.write = 0;
2272
2273				r = r600_bytecode_add_alu(ctx->bc, &alu);
2274				if (r)
2275					return r;
2276			}
2277		} else {
2278			/* tmp.z = log(tmp.x) */
2279			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2280			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
2281			alu.src[0].sel = ctx->temp_reg;
2282			alu.src[0].chan = 0;
2283			alu.dst.sel = ctx->temp_reg;
2284			alu.dst.chan = 2;
2285			alu.dst.write = 1;
2286			alu.last = 1;
2287			r = r600_bytecode_add_alu(ctx->bc, &alu);
2288			if (r)
2289				return r;
2290		}
2291
2292		chan = alu.dst.chan;
2293		sel = alu.dst.sel;
2294
2295		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
2296		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2297		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT);
2298		alu.src[0].sel  = sel;
2299		alu.src[0].chan = chan;
2300		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
2301		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
2302		alu.dst.sel = ctx->temp_reg;
2303		alu.dst.chan = 0;
2304		alu.dst.write = 1;
2305		alu.is_op3 = 1;
2306		alu.last = 1;
2307		r = r600_bytecode_add_alu(ctx->bc, &alu);
2308		if (r)
2309			return r;
2310
2311		if (ctx->bc->chip_class == CAYMAN) {
2312			for (i = 0; i < 3; i++) {
2313				/* dst.z = exp(tmp.x) */
2314				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2315				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2316				alu.src[0].sel = ctx->temp_reg;
2317				alu.src[0].chan = 0;
2318				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2319				if (i == 2) {
2320					alu.dst.write = 1;
2321					alu.last = 1;
2322				} else
2323					alu.dst.write = 0;
2324				r = r600_bytecode_add_alu(ctx->bc, &alu);
2325				if (r)
2326					return r;
2327			}
2328		} else {
2329			/* dst.z = exp(tmp.x) */
2330			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2331			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2332			alu.src[0].sel = ctx->temp_reg;
2333			alu.src[0].chan = 0;
2334			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2335			alu.last = 1;
2336			r = r600_bytecode_add_alu(ctx->bc, &alu);
2337			if (r)
2338				return r;
2339		}
2340	}
2341
2342	/* dst.x, <- 1.0  */
2343	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2344	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2345	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
2346	alu.src[0].chan = 0;
2347	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2348	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
2349	r = r600_bytecode_add_alu(ctx->bc, &alu);
2350	if (r)
2351		return r;
2352
2353	/* dst.y = max(src.x, 0.0) */
2354	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2355	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
2356	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2357	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
2358	alu.src[1].chan = 0;
2359	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2360	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
2361	r = r600_bytecode_add_alu(ctx->bc, &alu);
2362	if (r)
2363		return r;
2364
2365	/* dst.w, <- 1.0  */
2366	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2367	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2368	alu.src[0].sel  = V_SQ_ALU_SRC_1;
2369	alu.src[0].chan = 0;
2370	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2371	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
2372	alu.last = 1;
2373	r = r600_bytecode_add_alu(ctx->bc, &alu);
2374	if (r)
2375		return r;
2376
2377	return 0;
2378}
2379
2380static int tgsi_rsq(struct r600_shader_ctx *ctx)
2381{
2382	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2383	struct r600_bytecode_alu alu;
2384	int i, r;
2385
2386	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2387
2388	/* XXX:
2389	 * For state trackers other than OpenGL, we'll want to use
2390	 * _RECIPSQRT_IEEE instead.
2391	 */
2392	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED);
2393
2394	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2395		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2396		r600_bytecode_src_set_abs(&alu.src[i]);
2397	}
2398	alu.dst.sel = ctx->temp_reg;
2399	alu.dst.write = 1;
2400	alu.last = 1;
2401	r = r600_bytecode_add_alu(ctx->bc, &alu);
2402	if (r)
2403		return r;
2404	/* replicate result */
2405	return tgsi_helper_tempx_replicate(ctx);
2406}
2407
2408static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
2409{
2410	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2411	struct r600_bytecode_alu alu;
2412	int i, r;
2413
2414	for (i = 0; i < 4; i++) {
2415		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2416		alu.src[0].sel = ctx->temp_reg;
2417		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
2418		alu.dst.chan = i;
2419		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2420		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2421		if (i == 3)
2422			alu.last = 1;
2423		r = r600_bytecode_add_alu(ctx->bc, &alu);
2424		if (r)
2425			return r;
2426	}
2427	return 0;
2428}
2429
2430static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
2431{
2432	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2433	struct r600_bytecode_alu alu;
2434	int i, r;
2435
2436	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2437	alu.inst = ctx->inst_info->r600_opcode;
2438	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2439		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2440	}
2441	alu.dst.sel = ctx->temp_reg;
2442	alu.dst.write = 1;
2443	alu.last = 1;
2444	r = r600_bytecode_add_alu(ctx->bc, &alu);
2445	if (r)
2446		return r;
2447	/* replicate result */
2448	return tgsi_helper_tempx_replicate(ctx);
2449}
2450
2451static int cayman_pow(struct r600_shader_ctx *ctx)
2452{
2453	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2454	int i, r;
2455	struct r600_bytecode_alu alu;
2456	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2457
2458	for (i = 0; i < 3; i++) {
2459		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2460		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2461		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2462		alu.dst.sel = ctx->temp_reg;
2463		alu.dst.chan = i;
2464		alu.dst.write = 1;
2465		if (i == 2)
2466			alu.last = 1;
2467		r = r600_bytecode_add_alu(ctx->bc, &alu);
2468		if (r)
2469			return r;
2470	}
2471
2472	/* b * LOG2(a) */
2473	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2474	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2475	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2476	alu.src[1].sel = ctx->temp_reg;
2477	alu.dst.sel = ctx->temp_reg;
2478	alu.dst.write = 1;
2479	alu.last = 1;
2480	r = r600_bytecode_add_alu(ctx->bc, &alu);
2481	if (r)
2482		return r;
2483
2484	for (i = 0; i < last_slot; i++) {
2485		/* POW(a,b) = EXP2(b * LOG2(a))*/
2486		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2487		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2488		alu.src[0].sel = ctx->temp_reg;
2489
2490		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2491		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2492		if (i == last_slot - 1)
2493			alu.last = 1;
2494		r = r600_bytecode_add_alu(ctx->bc, &alu);
2495		if (r)
2496			return r;
2497	}
2498	return 0;
2499}
2500
2501static int tgsi_pow(struct r600_shader_ctx *ctx)
2502{
2503	struct r600_bytecode_alu alu;
2504	int r;
2505
2506	/* LOG2(a) */
2507	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2508	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
2509	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2510	alu.dst.sel = ctx->temp_reg;
2511	alu.dst.write = 1;
2512	alu.last = 1;
2513	r = r600_bytecode_add_alu(ctx->bc, &alu);
2514	if (r)
2515		return r;
2516	/* b * LOG2(a) */
2517	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2518	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2519	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
2520	alu.src[1].sel = ctx->temp_reg;
2521	alu.dst.sel = ctx->temp_reg;
2522	alu.dst.write = 1;
2523	alu.last = 1;
2524	r = r600_bytecode_add_alu(ctx->bc, &alu);
2525	if (r)
2526		return r;
2527	/* POW(a,b) = EXP2(b * LOG2(a))*/
2528	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2529	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
2530	alu.src[0].sel = ctx->temp_reg;
2531	alu.dst.sel = ctx->temp_reg;
2532	alu.dst.write = 1;
2533	alu.last = 1;
2534	r = r600_bytecode_add_alu(ctx->bc, &alu);
2535	if (r)
2536		return r;
2537	return tgsi_helper_tempx_replicate(ctx);
2538}
2539
2540static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
2541{
2542	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2543	struct r600_bytecode_alu alu;
2544	int i, r, j;
2545	unsigned write_mask = inst->Dst[0].Register.WriteMask;
2546	int tmp0 = ctx->temp_reg;
2547	int tmp1 = r600_get_temp(ctx);
2548	int tmp2 = r600_get_temp(ctx);
2549	int tmp3 = r600_get_temp(ctx);
2550	/* Unsigned path:
2551	 *
2552	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
2553	 *
2554	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
2555	 * 2. tmp0.z = lo (tmp0.x * src2)
2556	 * 3. tmp0.w = -tmp0.z
2557	 * 4. tmp0.y = hi (tmp0.x * src2)
2558	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
2559	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
2560	 * 7. tmp1.x = tmp0.x - tmp0.w
2561	 * 8. tmp1.y = tmp0.x + tmp0.w
2562	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
2563	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
2564	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
2565	 *
2566	 * 12. tmp0.w = src1 - tmp0.y       = r
2567	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
2568	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
2569	 *
2570	 * if DIV
2571	 *
2572	 *   15. tmp1.z = tmp0.z + 1			= q + 1
2573	 *   16. tmp1.w = tmp0.z - 1			= q - 1
2574	 *
2575	 * else MOD
2576	 *
2577	 *   15. tmp1.z = tmp0.w - src2			= r - src2
2578	 *   16. tmp1.w = tmp0.w + src2			= r + src2
2579	 *
2580	 * endif
2581	 *
2582	 * 17. tmp1.x = tmp1.x & tmp1.y
2583	 *
2584	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
2585	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
2586	 *
2587	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
2588	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
2589	 *
2590	 * Signed path:
2591	 *
2592	 * Same as unsigned, using abs values of the operands,
2593	 * and fixing the sign of the result in the end.
2594	 */
2595
2596	for (i = 0; i < 4; i++) {
2597		if (!(write_mask & (1<<i)))
2598			continue;
2599
2600		if (signed_op) {
2601
2602			/* tmp2.x = -src0 */
2603			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2604			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2605
2606			alu.dst.sel = tmp2;
2607			alu.dst.chan = 0;
2608			alu.dst.write = 1;
2609
2610			alu.src[0].sel = V_SQ_ALU_SRC_0;
2611
2612			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2613
2614			alu.last = 1;
2615			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2616				return r;
2617
2618			/* tmp2.y = -src1 */
2619			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2620			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2621
2622			alu.dst.sel = tmp2;
2623			alu.dst.chan = 1;
2624			alu.dst.write = 1;
2625
2626			alu.src[0].sel = V_SQ_ALU_SRC_0;
2627
2628			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2629
2630			alu.last = 1;
2631			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2632				return r;
2633
2634			/* tmp2.z sign bit is set if src0 and src2 signs are different */
2635			/* it will be a sign of the quotient */
2636			if (!mod) {
2637
2638				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2639				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT);
2640
2641				alu.dst.sel = tmp2;
2642				alu.dst.chan = 2;
2643				alu.dst.write = 1;
2644
2645				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2646				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2647
2648				alu.last = 1;
2649				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2650					return r;
2651			}
2652
2653			/* tmp2.x = |src0| */
2654			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2655			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2656			alu.is_op3 = 1;
2657
2658			alu.dst.sel = tmp2;
2659			alu.dst.chan = 0;
2660			alu.dst.write = 1;
2661
2662			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2663			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2664			alu.src[2].sel = tmp2;
2665			alu.src[2].chan = 0;
2666
2667			alu.last = 1;
2668			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2669				return r;
2670
2671			/* tmp2.y = |src1| */
2672			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2673			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
2674			alu.is_op3 = 1;
2675
2676			alu.dst.sel = tmp2;
2677			alu.dst.chan = 1;
2678			alu.dst.write = 1;
2679
2680			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2681			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2682			alu.src[2].sel = tmp2;
2683			alu.src[2].chan = 1;
2684
2685			alu.last = 1;
2686			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2687				return r;
2688
2689		}
2690
2691		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
2692		if (ctx->bc->chip_class == CAYMAN) {
2693			/* tmp3.x = u2f(src2) */
2694			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2695			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
2696
2697			alu.dst.sel = tmp3;
2698			alu.dst.chan = 0;
2699			alu.dst.write = 1;
2700
2701			if (signed_op) {
2702				alu.src[0].sel = tmp2;
2703				alu.src[0].chan = 1;
2704			} else {
2705				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2706			}
2707
2708			alu.last = 1;
2709			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2710				return r;
2711
2712			/* tmp0.x = recip(tmp3.x) */
2713			for (j = 0 ; j < 3; j++) {
2714				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2715				alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE;
2716
2717				alu.dst.sel = tmp0;
2718				alu.dst.chan = j;
2719				alu.dst.write = (j == 0);
2720
2721				alu.src[0].sel = tmp3;
2722				alu.src[0].chan = 0;
2723
2724				if (j == 2)
2725					alu.last = 1;
2726				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2727					return r;
2728			}
2729
2730			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2731			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
2732
2733			alu.src[0].sel = tmp0;
2734			alu.src[0].chan = 0;
2735
2736			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2737			alu.src[1].value = 0x4f800000;
2738
2739			alu.dst.sel = tmp3;
2740			alu.dst.write = 1;
2741			alu.last = 1;
2742			r = r600_bytecode_add_alu(ctx->bc, &alu);
2743			if (r)
2744				return r;
2745
2746			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2747			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
2748
2749			alu.dst.sel = tmp0;
2750			alu.dst.chan = 0;
2751			alu.dst.write = 1;
2752
2753			alu.src[0].sel = tmp3;
2754			alu.src[0].chan = 0;
2755
2756			alu.last = 1;
2757			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2758				return r;
2759
2760		} else {
2761			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2762			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT);
2763
2764			alu.dst.sel = tmp0;
2765			alu.dst.chan = 0;
2766			alu.dst.write = 1;
2767
2768			if (signed_op) {
2769				alu.src[0].sel = tmp2;
2770				alu.src[0].chan = 1;
2771			} else {
2772				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2773			}
2774
2775			alu.last = 1;
2776			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2777				return r;
2778		}
2779
2780		/* 2. tmp0.z = lo (tmp0.x * src2) */
2781		if (ctx->bc->chip_class == CAYMAN) {
2782			for (j = 0 ; j < 4; j++) {
2783				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2784				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2785
2786				alu.dst.sel = tmp0;
2787				alu.dst.chan = j;
2788				alu.dst.write = (j == 2);
2789
2790				alu.src[0].sel = tmp0;
2791				alu.src[0].chan = 0;
2792				if (signed_op) {
2793					alu.src[1].sel = tmp2;
2794					alu.src[1].chan = 1;
2795				} else {
2796					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2797				}
2798
2799				alu.last = (j == 3);
2800				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2801					return r;
2802			}
2803		} else {
2804			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2805			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
2806
2807			alu.dst.sel = tmp0;
2808			alu.dst.chan = 2;
2809			alu.dst.write = 1;
2810
2811			alu.src[0].sel = tmp0;
2812			alu.src[0].chan = 0;
2813			if (signed_op) {
2814				alu.src[1].sel = tmp2;
2815				alu.src[1].chan = 1;
2816			} else {
2817				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2818			}
2819
2820			alu.last = 1;
2821			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2822				return r;
2823		}
2824
2825		/* 3. tmp0.w = -tmp0.z */
2826		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2827		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2828
2829		alu.dst.sel = tmp0;
2830		alu.dst.chan = 3;
2831		alu.dst.write = 1;
2832
2833		alu.src[0].sel = V_SQ_ALU_SRC_0;
2834		alu.src[1].sel = tmp0;
2835		alu.src[1].chan = 2;
2836
2837		alu.last = 1;
2838		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2839			return r;
2840
2841		/* 4. tmp0.y = hi (tmp0.x * src2) */
2842		if (ctx->bc->chip_class == CAYMAN) {
2843			for (j = 0 ; j < 4; j++) {
2844				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2845				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2846
2847				alu.dst.sel = tmp0;
2848				alu.dst.chan = j;
2849				alu.dst.write = (j == 1);
2850
2851				alu.src[0].sel = tmp0;
2852				alu.src[0].chan = 0;
2853
2854				if (signed_op) {
2855					alu.src[1].sel = tmp2;
2856					alu.src[1].chan = 1;
2857				} else {
2858					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2859				}
2860				alu.last = (j == 3);
2861				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2862					return r;
2863			}
2864		} else {
2865			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2866			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2867
2868			alu.dst.sel = tmp0;
2869			alu.dst.chan = 1;
2870			alu.dst.write = 1;
2871
2872			alu.src[0].sel = tmp0;
2873			alu.src[0].chan = 0;
2874
2875			if (signed_op) {
2876				alu.src[1].sel = tmp2;
2877				alu.src[1].chan = 1;
2878			} else {
2879				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
2880			}
2881
2882			alu.last = 1;
2883			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2884				return r;
2885		}
2886
2887		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
2888		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2889		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
2890		alu.is_op3 = 1;
2891
2892		alu.dst.sel = tmp0;
2893		alu.dst.chan = 2;
2894		alu.dst.write = 1;
2895
2896		alu.src[0].sel = tmp0;
2897		alu.src[0].chan = 1;
2898		alu.src[1].sel = tmp0;
2899		alu.src[1].chan = 3;
2900		alu.src[2].sel = tmp0;
2901		alu.src[2].chan = 2;
2902
2903		alu.last = 1;
2904		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2905			return r;
2906
2907		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
2908		if (ctx->bc->chip_class == CAYMAN) {
2909			for (j = 0 ; j < 4; j++) {
2910				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2911				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2912
2913				alu.dst.sel = tmp0;
2914				alu.dst.chan = j;
2915				alu.dst.write = (j == 3);
2916
2917				alu.src[0].sel = tmp0;
2918				alu.src[0].chan = 2;
2919
2920				alu.src[1].sel = tmp0;
2921				alu.src[1].chan = 0;
2922
2923				alu.last = (j == 3);
2924				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2925					return r;
2926			}
2927		} else {
2928			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2929			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2930
2931			alu.dst.sel = tmp0;
2932			alu.dst.chan = 3;
2933			alu.dst.write = 1;
2934
2935			alu.src[0].sel = tmp0;
2936			alu.src[0].chan = 2;
2937
2938			alu.src[1].sel = tmp0;
2939			alu.src[1].chan = 0;
2940
2941			alu.last = 1;
2942			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2943				return r;
2944		}
2945
2946		/* 7. tmp1.x = tmp0.x - tmp0.w */
2947		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2948		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
2949
2950		alu.dst.sel = tmp1;
2951		alu.dst.chan = 0;
2952		alu.dst.write = 1;
2953
2954		alu.src[0].sel = tmp0;
2955		alu.src[0].chan = 0;
2956		alu.src[1].sel = tmp0;
2957		alu.src[1].chan = 3;
2958
2959		alu.last = 1;
2960		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2961			return r;
2962
2963		/* 8. tmp1.y = tmp0.x + tmp0.w */
2964		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2965		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
2966
2967		alu.dst.sel = tmp1;
2968		alu.dst.chan = 1;
2969		alu.dst.write = 1;
2970
2971		alu.src[0].sel = tmp0;
2972		alu.src[0].chan = 0;
2973		alu.src[1].sel = tmp0;
2974		alu.src[1].chan = 3;
2975
2976		alu.last = 1;
2977		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2978			return r;
2979
2980		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
2981		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2982		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
2983		alu.is_op3 = 1;
2984
2985		alu.dst.sel = tmp0;
2986		alu.dst.chan = 0;
2987		alu.dst.write = 1;
2988
2989		alu.src[0].sel = tmp0;
2990		alu.src[0].chan = 1;
2991		alu.src[1].sel = tmp1;
2992		alu.src[1].chan = 1;
2993		alu.src[2].sel = tmp1;
2994		alu.src[2].chan = 0;
2995
2996		alu.last = 1;
2997		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
2998			return r;
2999
3000		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
3001		if (ctx->bc->chip_class == CAYMAN) {
3002			for (j = 0 ; j < 4; j++) {
3003				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3004				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3005
3006				alu.dst.sel = tmp0;
3007				alu.dst.chan = j;
3008				alu.dst.write = (j == 2);
3009
3010				alu.src[0].sel = tmp0;
3011				alu.src[0].chan = 0;
3012
3013				if (signed_op) {
3014					alu.src[1].sel = tmp2;
3015					alu.src[1].chan = 0;
3016				} else {
3017					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3018				}
3019
3020				alu.last = (j == 3);
3021				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3022					return r;
3023			}
3024		} else {
3025			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3026			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
3027
3028			alu.dst.sel = tmp0;
3029			alu.dst.chan = 2;
3030			alu.dst.write = 1;
3031
3032			alu.src[0].sel = tmp0;
3033			alu.src[0].chan = 0;
3034
3035			if (signed_op) {
3036				alu.src[1].sel = tmp2;
3037				alu.src[1].chan = 0;
3038			} else {
3039				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3040			}
3041
3042			alu.last = 1;
3043			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3044				return r;
3045		}
3046
3047		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
3048		if (ctx->bc->chip_class == CAYMAN) {
3049			for (j = 0 ; j < 4; j++) {
3050				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3051				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3052
3053				alu.dst.sel = tmp0;
3054				alu.dst.chan = j;
3055				alu.dst.write = (j == 1);
3056
3057				if (signed_op) {
3058					alu.src[0].sel = tmp2;
3059					alu.src[0].chan = 1;
3060				} else {
3061					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3062				}
3063
3064				alu.src[1].sel = tmp0;
3065				alu.src[1].chan = 2;
3066
3067				alu.last = (j == 3);
3068				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3069					return r;
3070			}
3071		} else {
3072			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3073			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
3074
3075			alu.dst.sel = tmp0;
3076			alu.dst.chan = 1;
3077			alu.dst.write = 1;
3078
3079			if (signed_op) {
3080				alu.src[0].sel = tmp2;
3081				alu.src[0].chan = 1;
3082			} else {
3083				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3084			}
3085
3086			alu.src[1].sel = tmp0;
3087			alu.src[1].chan = 2;
3088
3089			alu.last = 1;
3090			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3091				return r;
3092		}
3093
3094		/* 12. tmp0.w = src1 - tmp0.y       = r */
3095		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3096		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3097
3098		alu.dst.sel = tmp0;
3099		alu.dst.chan = 3;
3100		alu.dst.write = 1;
3101
3102		if (signed_op) {
3103			alu.src[0].sel = tmp2;
3104			alu.src[0].chan = 0;
3105		} else {
3106			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3107		}
3108
3109		alu.src[1].sel = tmp0;
3110		alu.src[1].chan = 1;
3111
3112		alu.last = 1;
3113		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3114			return r;
3115
3116		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
3117		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3118		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3119
3120		alu.dst.sel = tmp1;
3121		alu.dst.chan = 0;
3122		alu.dst.write = 1;
3123
3124		alu.src[0].sel = tmp0;
3125		alu.src[0].chan = 3;
3126		if (signed_op) {
3127			alu.src[1].sel = tmp2;
3128			alu.src[1].chan = 1;
3129		} else {
3130			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3131		}
3132
3133		alu.last = 1;
3134		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3135			return r;
3136
3137		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
3138		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3139		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
3140
3141		alu.dst.sel = tmp1;
3142		alu.dst.chan = 1;
3143		alu.dst.write = 1;
3144
3145		if (signed_op) {
3146			alu.src[0].sel = tmp2;
3147			alu.src[0].chan = 0;
3148		} else {
3149			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3150		}
3151
3152		alu.src[1].sel = tmp0;
3153		alu.src[1].chan = 1;
3154
3155		alu.last = 1;
3156		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3157			return r;
3158
3159		if (mod) { /* UMOD */
3160
3161			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
3162			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3163			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3164
3165			alu.dst.sel = tmp1;
3166			alu.dst.chan = 2;
3167			alu.dst.write = 1;
3168
3169			alu.src[0].sel = tmp0;
3170			alu.src[0].chan = 3;
3171
3172			if (signed_op) {
3173				alu.src[1].sel = tmp2;
3174				alu.src[1].chan = 1;
3175			} else {
3176				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3177			}
3178
3179			alu.last = 1;
3180			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3181				return r;
3182
3183			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
3184			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3185			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3186
3187			alu.dst.sel = tmp1;
3188			alu.dst.chan = 3;
3189			alu.dst.write = 1;
3190
3191			alu.src[0].sel = tmp0;
3192			alu.src[0].chan = 3;
3193			if (signed_op) {
3194				alu.src[1].sel = tmp2;
3195				alu.src[1].chan = 1;
3196			} else {
3197				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3198			}
3199
3200			alu.last = 1;
3201			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3202				return r;
3203
3204		} else { /* UDIV */
3205
3206			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
3207			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3208			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3209
3210			alu.dst.sel = tmp1;
3211			alu.dst.chan = 2;
3212			alu.dst.write = 1;
3213
3214			alu.src[0].sel = tmp0;
3215			alu.src[0].chan = 2;
3216			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3217
3218			alu.last = 1;
3219			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3220				return r;
3221
3222			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
3223			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3224			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
3225
3226			alu.dst.sel = tmp1;
3227			alu.dst.chan = 3;
3228			alu.dst.write = 1;
3229
3230			alu.src[0].sel = tmp0;
3231			alu.src[0].chan = 2;
3232			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
3233
3234			alu.last = 1;
3235			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3236				return r;
3237
3238		}
3239
3240		/* 17. tmp1.x = tmp1.x & tmp1.y */
3241		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3242		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
3243
3244		alu.dst.sel = tmp1;
3245		alu.dst.chan = 0;
3246		alu.dst.write = 1;
3247
3248		alu.src[0].sel = tmp1;
3249		alu.src[0].chan = 0;
3250		alu.src[1].sel = tmp1;
3251		alu.src[1].chan = 1;
3252
3253		alu.last = 1;
3254		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3255			return r;
3256
3257		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
3258		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
3259		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3260		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3261		alu.is_op3 = 1;
3262
3263		alu.dst.sel = tmp0;
3264		alu.dst.chan = 2;
3265		alu.dst.write = 1;
3266
3267		alu.src[0].sel = tmp1;
3268		alu.src[0].chan = 0;
3269		alu.src[1].sel = tmp0;
3270		alu.src[1].chan = mod ? 3 : 2;
3271		alu.src[2].sel = tmp1;
3272		alu.src[2].chan = 2;
3273
3274		alu.last = 1;
3275		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3276			return r;
3277
3278		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
3279		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3280		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
3281		alu.is_op3 = 1;
3282
3283		if (signed_op) {
3284			alu.dst.sel = tmp0;
3285			alu.dst.chan = 2;
3286			alu.dst.write = 1;
3287		} else {
3288			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3289		}
3290
3291		alu.src[0].sel = tmp1;
3292		alu.src[0].chan = 1;
3293		alu.src[1].sel = tmp1;
3294		alu.src[1].chan = 3;
3295		alu.src[2].sel = tmp0;
3296		alu.src[2].chan = 2;
3297
3298		alu.last = 1;
3299		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3300			return r;
3301
3302		if (signed_op) {
3303
3304			/* fix the sign of the result */
3305
3306			if (mod) {
3307
3308				/* tmp0.x = -tmp0.z */
3309				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3310				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3311
3312				alu.dst.sel = tmp0;
3313				alu.dst.chan = 0;
3314				alu.dst.write = 1;
3315
3316				alu.src[0].sel = V_SQ_ALU_SRC_0;
3317				alu.src[1].sel = tmp0;
3318				alu.src[1].chan = 2;
3319
3320				alu.last = 1;
3321				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3322					return r;
3323
3324				/* sign of the remainder is the same as the sign of src0 */
3325				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
3326				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3327				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3328				alu.is_op3 = 1;
3329
3330				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3331
3332				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3333				alu.src[1].sel = tmp0;
3334				alu.src[1].chan = 2;
3335				alu.src[2].sel = tmp0;
3336				alu.src[2].chan = 0;
3337
3338				alu.last = 1;
3339				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3340					return r;
3341
3342			} else {
3343
3344				/* tmp0.x = -tmp0.z */
3345				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3346				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3347
3348				alu.dst.sel = tmp0;
3349				alu.dst.chan = 0;
3350				alu.dst.write = 1;
3351
3352				alu.src[0].sel = V_SQ_ALU_SRC_0;
3353				alu.src[1].sel = tmp0;
3354				alu.src[1].chan = 2;
3355
3356				alu.last = 1;
3357				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3358					return r;
3359
3360				/* fix the quotient sign (same as the sign of src0*src1) */
3361				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
3362				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3363				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3364				alu.is_op3 = 1;
3365
3366				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3367
3368				alu.src[0].sel = tmp2;
3369				alu.src[0].chan = 2;
3370				alu.src[1].sel = tmp0;
3371				alu.src[1].chan = 2;
3372				alu.src[2].sel = tmp0;
3373				alu.src[2].chan = 0;
3374
3375				alu.last = 1;
3376				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3377					return r;
3378			}
3379		}
3380	}
3381	return 0;
3382}
3383
/*
 * TGSI unsigned integer division.
 *
 * Forwards to tgsi_divmod() asking for the quotient (mod flag clear) of an
 * unsigned operation (signed flag clear) and returns its result unchanged.
 */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3388
/*
 * TGSI unsigned integer modulo.
 *
 * Forwards to tgsi_divmod() asking for the remainder (mod flag set) of an
 * unsigned operation (signed flag clear) and returns its result unchanged.
 */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3393
/*
 * TGSI signed integer division.
 *
 * Forwards to tgsi_divmod() asking for the quotient (mod flag clear) of a
 * signed operation (signed flag set) and returns its result unchanged.
 */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3398
/*
 * TGSI signed integer modulo.
 *
 * Forwards to tgsi_divmod() asking for the remainder (mod flag set) of a
 * signed operation (signed flag set) and returns its result unchanged.
 */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
3403
3404
3405static int tgsi_f2i(struct r600_shader_ctx *ctx)
3406{
3407	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3408	struct r600_bytecode_alu alu;
3409	int i, r;
3410	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3411	int last_inst = tgsi_last_instruction(write_mask);
3412
3413	for (i = 0; i < 4; i++) {
3414		if (!(write_mask & (1<<i)))
3415			continue;
3416
3417		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3418		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC);
3419
3420		alu.dst.sel = ctx->temp_reg;
3421		alu.dst.chan = i;
3422		alu.dst.write = 1;
3423
3424		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3425		if (i == last_inst)
3426			alu.last = 1;
3427		r = r600_bytecode_add_alu(ctx->bc, &alu);
3428		if (r)
3429			return r;
3430	}
3431
3432	for (i = 0; i < 4; i++) {
3433		if (!(write_mask & (1<<i)))
3434			continue;
3435
3436		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3437		alu.inst = ctx->inst_info->r600_opcode;
3438
3439		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3440
3441		alu.src[0].sel = ctx->temp_reg;
3442		alu.src[0].chan = i;
3443
3444		if (i == last_inst || alu.inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT)
3445			alu.last = 1;
3446		r = r600_bytecode_add_alu(ctx->bc, &alu);
3447		if (r)
3448			return r;
3449	}
3450
3451	return 0;
3452}
3453
3454static int tgsi_iabs(struct r600_shader_ctx *ctx)
3455{
3456	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3457	struct r600_bytecode_alu alu;
3458	int i, r;
3459	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3460	int last_inst = tgsi_last_instruction(write_mask);
3461
3462	/* tmp = -src */
3463	for (i = 0; i < 4; i++) {
3464		if (!(write_mask & (1<<i)))
3465			continue;
3466
3467		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3468		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
3469
3470		alu.dst.sel = ctx->temp_reg;
3471		alu.dst.chan = i;
3472		alu.dst.write = 1;
3473
3474		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3475		alu.src[0].sel = V_SQ_ALU_SRC_0;
3476
3477		if (i == last_inst)
3478			alu.last = 1;
3479		r = r600_bytecode_add_alu(ctx->bc, &alu);
3480		if (r)
3481			return r;
3482	}
3483
3484	/* dst = (src >= 0 ? src : tmp) */
3485	for (i = 0; i < 4; i++) {
3486		if (!(write_mask & (1<<i)))
3487			continue;
3488
3489		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3490		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3491		alu.is_op3 = 1;
3492		alu.dst.write = 1;
3493
3494		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3495
3496		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3497		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3498		alu.src[2].sel = ctx->temp_reg;
3499		alu.src[2].chan = i;
3500
3501		if (i == last_inst)
3502			alu.last = 1;
3503		r = r600_bytecode_add_alu(ctx->bc, &alu);
3504		if (r)
3505			return r;
3506	}
3507	return 0;
3508}
3509
3510static int tgsi_issg(struct r600_shader_ctx *ctx)
3511{
3512	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3513	struct r600_bytecode_alu alu;
3514	int i, r;
3515	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3516	int last_inst = tgsi_last_instruction(write_mask);
3517
3518	/* tmp = (src >= 0 ? src : -1) */
3519	for (i = 0; i < 4; i++) {
3520		if (!(write_mask & (1<<i)))
3521			continue;
3522
3523		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3524		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
3525		alu.is_op3 = 1;
3526
3527		alu.dst.sel = ctx->temp_reg;
3528		alu.dst.chan = i;
3529		alu.dst.write = 1;
3530
3531		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3532		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3533		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
3534
3535		if (i == last_inst)
3536			alu.last = 1;
3537		r = r600_bytecode_add_alu(ctx->bc, &alu);
3538		if (r)
3539			return r;
3540	}
3541
3542	/* dst = (tmp > 0 ? 1 : tmp) */
3543	for (i = 0; i < 4; i++) {
3544		if (!(write_mask & (1<<i)))
3545			continue;
3546
3547		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3548		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT_INT);
3549		alu.is_op3 = 1;
3550		alu.dst.write = 1;
3551
3552		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3553
3554		alu.src[0].sel = ctx->temp_reg;
3555		alu.src[0].chan = i;
3556
3557		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3558
3559		alu.src[2].sel = ctx->temp_reg;
3560		alu.src[2].chan = i;
3561
3562		if (i == last_inst)
3563			alu.last = 1;
3564		r = r600_bytecode_add_alu(ctx->bc, &alu);
3565		if (r)
3566			return r;
3567	}
3568	return 0;
3569}
3570
3571
3572
3573static int tgsi_ssg(struct r600_shader_ctx *ctx)
3574{
3575	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3576	struct r600_bytecode_alu alu;
3577	int i, r;
3578
3579	/* tmp = (src > 0 ? 1 : src) */
3580	for (i = 0; i < 4; i++) {
3581		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3582		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3583		alu.is_op3 = 1;
3584
3585		alu.dst.sel = ctx->temp_reg;
3586		alu.dst.chan = i;
3587
3588		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3589		alu.src[1].sel = V_SQ_ALU_SRC_1;
3590		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
3591
3592		if (i == 3)
3593			alu.last = 1;
3594		r = r600_bytecode_add_alu(ctx->bc, &alu);
3595		if (r)
3596			return r;
3597	}
3598
3599	/* dst = (-tmp > 0 ? -1 : tmp) */
3600	for (i = 0; i < 4; i++) {
3601		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3602		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
3603		alu.is_op3 = 1;
3604		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3605
3606		alu.src[0].sel = ctx->temp_reg;
3607		alu.src[0].chan = i;
3608		alu.src[0].neg = 1;
3609
3610		alu.src[1].sel = V_SQ_ALU_SRC_1;
3611		alu.src[1].neg = 1;
3612
3613		alu.src[2].sel = ctx->temp_reg;
3614		alu.src[2].chan = i;
3615
3616		if (i == 3)
3617			alu.last = 1;
3618		r = r600_bytecode_add_alu(ctx->bc, &alu);
3619		if (r)
3620			return r;
3621	}
3622	return 0;
3623}
3624
3625static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
3626{
3627	struct r600_bytecode_alu alu;
3628	int i, r;
3629
3630	for (i = 0; i < 4; i++) {
3631		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3632		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
3633			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
3634			alu.dst.chan = i;
3635		} else {
3636			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3637			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3638			alu.src[0].sel = ctx->temp_reg;
3639			alu.src[0].chan = i;
3640		}
3641		if (i == 3) {
3642			alu.last = 1;
3643		}
3644		r = r600_bytecode_add_alu(ctx->bc, &alu);
3645		if (r)
3646			return r;
3647	}
3648	return 0;
3649}
3650
3651static int tgsi_op3(struct r600_shader_ctx *ctx)
3652{
3653	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3654	struct r600_bytecode_alu alu;
3655	int i, j, r;
3656	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3657
3658	for (i = 0; i < lasti + 1; i++) {
3659		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3660			continue;
3661
3662		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3663		alu.inst = ctx->inst_info->r600_opcode;
3664		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3665			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3666		}
3667
3668		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3669		alu.dst.chan = i;
3670		alu.dst.write = 1;
3671		alu.is_op3 = 1;
3672		if (i == lasti) {
3673			alu.last = 1;
3674		}
3675		r = r600_bytecode_add_alu(ctx->bc, &alu);
3676		if (r)
3677			return r;
3678	}
3679	return 0;
3680}
3681
3682static int tgsi_dp(struct r600_shader_ctx *ctx)
3683{
3684	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3685	struct r600_bytecode_alu alu;
3686	int i, j, r;
3687
3688	for (i = 0; i < 4; i++) {
3689		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3690		alu.inst = ctx->inst_info->r600_opcode;
3691		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3692			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
3693		}
3694
3695		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3696		alu.dst.chan = i;
3697		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3698		/* handle some special cases */
3699		switch (ctx->inst_info->tgsi_opcode) {
3700		case TGSI_OPCODE_DP2:
3701			if (i > 1) {
3702				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3703				alu.src[0].chan = alu.src[1].chan = 0;
3704			}
3705			break;
3706		case TGSI_OPCODE_DP3:
3707			if (i > 2) {
3708				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
3709				alu.src[0].chan = alu.src[1].chan = 0;
3710			}
3711			break;
3712		case TGSI_OPCODE_DPH:
3713			if (i == 3) {
3714				alu.src[0].sel = V_SQ_ALU_SRC_1;
3715				alu.src[0].chan = 0;
3716				alu.src[0].neg = 0;
3717			}
3718			break;
3719		default:
3720			break;
3721		}
3722		if (i == 3) {
3723			alu.last = 1;
3724		}
3725		r = r600_bytecode_add_alu(ctx->bc, &alu);
3726		if (r)
3727			return r;
3728	}
3729	return 0;
3730}
3731
3732static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
3733						    unsigned index)
3734{
3735	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3736	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
3737		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
3738		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
3739		ctx->src[index].neg || ctx->src[index].abs;
3740}
3741
3742static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
3743					unsigned index)
3744{
3745	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3746	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
3747}
3748
3749static int tgsi_tex(struct r600_shader_ctx *ctx)
3750{
3751	static float one_point_five = 1.5f;
3752	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3753	struct r600_bytecode_tex tex;
3754	struct r600_bytecode_alu alu;
3755	unsigned src_gpr;
3756	int r, i, j;
3757	int opcode;
3758	/* Texture fetch instructions can only use gprs as source.
3759	 * Also they cannot negate the source or take the absolute value */
3760	const boolean src_requires_loading = inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
3761                                             tgsi_tex_src_requires_loading(ctx, 0);
3762	boolean src_loaded = FALSE;
3763	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
3764	uint8_t offset_x = 0, offset_y = 0, offset_z = 0;
3765
3766	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
3767
3768	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
3769		/* get offset values */
3770		if (inst->Texture.NumOffsets) {
3771			assert(inst->Texture.NumOffsets == 1);
3772
3773			offset_x = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
3774			offset_y = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
3775			offset_z = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
3776		}
3777	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
3778		/* TGSI moves the sampler to src reg 3 for TXD */
3779		sampler_src_reg = 3;
3780
3781		for (i = 1; i < 3; i++) {
3782			/* set gradients h/v */
3783			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
3784			tex.inst = (i == 1) ? SQ_TEX_INST_SET_GRADIENTS_H :
3785				SQ_TEX_INST_SET_GRADIENTS_V;
3786			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
3787			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
3788
3789			if (tgsi_tex_src_requires_loading(ctx, i)) {
3790				tex.src_gpr = r600_get_temp(ctx);
3791				tex.src_sel_x = 0;
3792				tex.src_sel_y = 1;
3793				tex.src_sel_z = 2;
3794				tex.src_sel_w = 3;
3795
3796				for (j = 0; j < 4; j++) {
3797					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3798					alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3799                                        r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
3800                                        alu.dst.sel = tex.src_gpr;
3801                                        alu.dst.chan = j;
3802                                        if (j == 3)
3803                                                alu.last = 1;
3804                                        alu.dst.write = 1;
3805                                        r = r600_bytecode_add_alu(ctx->bc, &alu);
3806                                        if (r)
3807                                                return r;
3808				}
3809
3810			} else {
3811				tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i);
3812				tex.src_sel_x = ctx->src[i].swizzle[0];
3813				tex.src_sel_y = ctx->src[i].swizzle[1];
3814				tex.src_sel_z = ctx->src[i].swizzle[2];
3815				tex.src_sel_w = ctx->src[i].swizzle[3];
3816				tex.src_rel = ctx->src[i].rel;
3817			}
3818			tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */
3819			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
3820			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
3821				tex.coord_type_x = 1;
3822				tex.coord_type_y = 1;
3823				tex.coord_type_z = 1;
3824				tex.coord_type_w = 1;
3825			}
3826			r = r600_bytecode_add_tex(ctx->bc, &tex);
3827			if (r)
3828				return r;
3829		}
3830	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
3831		int out_chan;
3832		/* Add perspective divide */
3833		if (ctx->bc->chip_class == CAYMAN) {
3834			out_chan = 2;
3835			for (i = 0; i < 3; i++) {
3836				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3837				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3838				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3839
3840				alu.dst.sel = ctx->temp_reg;
3841				alu.dst.chan = i;
3842				if (i == 2)
3843					alu.last = 1;
3844				if (out_chan == i)
3845					alu.dst.write = 1;
3846				r = r600_bytecode_add_alu(ctx->bc, &alu);
3847				if (r)
3848					return r;
3849			}
3850
3851		} else {
3852			out_chan = 3;
3853			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3854			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3855			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
3856
3857			alu.dst.sel = ctx->temp_reg;
3858			alu.dst.chan = out_chan;
3859			alu.last = 1;
3860			alu.dst.write = 1;
3861			r = r600_bytecode_add_alu(ctx->bc, &alu);
3862			if (r)
3863				return r;
3864		}
3865
3866		for (i = 0; i < 3; i++) {
3867			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3868			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
3869			alu.src[0].sel = ctx->temp_reg;
3870			alu.src[0].chan = out_chan;
3871			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3872			alu.dst.sel = ctx->temp_reg;
3873			alu.dst.chan = i;
3874			alu.dst.write = 1;
3875			r = r600_bytecode_add_alu(ctx->bc, &alu);
3876			if (r)
3877				return r;
3878		}
3879		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3880		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
3881		alu.src[0].sel = V_SQ_ALU_SRC_1;
3882		alu.src[0].chan = 0;
3883		alu.dst.sel = ctx->temp_reg;
3884		alu.dst.chan = 3;
3885		alu.last = 1;
3886		alu.dst.write = 1;
3887		r = r600_bytecode_add_alu(ctx->bc, &alu);
3888		if (r)
3889			return r;
3890		src_loaded = TRUE;
3891		src_gpr = ctx->temp_reg;
3892	}
3893
3894	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
3895	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) &&
3896	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
3897	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
3898
3899		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
3900		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
3901
3902		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
3903		for (i = 0; i < 4; i++) {
3904			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3905			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE);
3906			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
3907			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
3908			alu.dst.sel = ctx->temp_reg;
3909			alu.dst.chan = i;
3910			if (i == 3)
3911				alu.last = 1;
3912			alu.dst.write = 1;
3913			r = r600_bytecode_add_alu(ctx->bc, &alu);
3914			if (r)
3915				return r;
3916		}
3917
3918		/* tmp1.z = RCP_e(|tmp1.z|) */
3919		if (ctx->bc->chip_class == CAYMAN) {
3920			for (i = 0; i < 3; i++) {
3921				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3922				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3923				alu.src[0].sel = ctx->temp_reg;
3924				alu.src[0].chan = 2;
3925				alu.src[0].abs = 1;
3926				alu.dst.sel = ctx->temp_reg;
3927				alu.dst.chan = i;
3928				if (i == 2)
3929					alu.dst.write = 1;
3930				if (i == 2)
3931					alu.last = 1;
3932				r = r600_bytecode_add_alu(ctx->bc, &alu);
3933				if (r)
3934					return r;
3935			}
3936		} else {
3937			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3938			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
3939			alu.src[0].sel = ctx->temp_reg;
3940			alu.src[0].chan = 2;
3941			alu.src[0].abs = 1;
3942			alu.dst.sel = ctx->temp_reg;
3943			alu.dst.chan = 2;
3944			alu.dst.write = 1;
3945			alu.last = 1;
3946			r = r600_bytecode_add_alu(ctx->bc, &alu);
3947			if (r)
3948				return r;
3949		}
3950
3951		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
3952		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
3953		 * muladd has no writemask, have to use another temp
3954		 */
3955		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3956		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
3957		alu.is_op3 = 1;
3958
3959		alu.src[0].sel = ctx->temp_reg;
3960		alu.src[0].chan = 0;
3961		alu.src[1].sel = ctx->temp_reg;
3962		alu.src[1].chan = 2;
3963
3964		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
3965		alu.src[2].chan = 0;
3966		alu.src[2].value = *(uint32_t *)&one_point_five;
3967
3968		alu.dst.sel = ctx->temp_reg;
3969		alu.dst.chan = 0;
3970		alu.dst.write = 1;
3971
3972		r = r600_bytecode_add_alu(ctx->bc, &alu);
3973		if (r)
3974			return r;
3975
3976		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3977		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
3978		alu.is_op3 = 1;
3979
3980		alu.src[0].sel = ctx->temp_reg;
3981		alu.src[0].chan = 1;
3982		alu.src[1].sel = ctx->temp_reg;
3983		alu.src[1].chan = 2;
3984
3985		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
3986		alu.src[2].chan = 0;
3987		alu.src[2].value = *(uint32_t *)&one_point_five;
3988
3989		alu.dst.sel = ctx->temp_reg;
3990		alu.dst.chan = 1;
3991		alu.dst.write = 1;
3992
3993		alu.last = 1;
3994		r = r600_bytecode_add_alu(ctx->bc, &alu);
3995		if (r)
3996			return r;
3997		/* write initial W value into Z component */
3998		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
3999			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4000			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4001			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4002			alu.dst.sel = ctx->temp_reg;
4003			alu.dst.chan = 2;
4004			alu.dst.write = 1;
4005			alu.last = 1;
4006			r = r600_bytecode_add_alu(ctx->bc, &alu);
4007			if (r)
4008				return r;
4009		}
4010		src_loaded = TRUE;
4011		src_gpr = ctx->temp_reg;
4012	}
4013
4014	if (src_requires_loading && !src_loaded) {
4015		for (i = 0; i < 4; i++) {
4016			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4017			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4018			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4019			alu.dst.sel = ctx->temp_reg;
4020			alu.dst.chan = i;
4021			if (i == 3)
4022				alu.last = 1;
4023			alu.dst.write = 1;
4024			r = r600_bytecode_add_alu(ctx->bc, &alu);
4025			if (r)
4026				return r;
4027		}
4028		src_loaded = TRUE;
4029		src_gpr = ctx->temp_reg;
4030	}
4031
4032	opcode = ctx->inst_info->r600_opcode;
4033	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4034	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4035	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4036	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4037	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
4038	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
4039		switch (opcode) {
4040		case SQ_TEX_INST_SAMPLE:
4041			opcode = SQ_TEX_INST_SAMPLE_C;
4042			break;
4043		case SQ_TEX_INST_SAMPLE_L:
4044			opcode = SQ_TEX_INST_SAMPLE_C_L;
4045			break;
4046		case SQ_TEX_INST_SAMPLE_LB:
4047			opcode = SQ_TEX_INST_SAMPLE_C_LB;
4048			break;
4049		case SQ_TEX_INST_SAMPLE_G:
4050			opcode = SQ_TEX_INST_SAMPLE_C_G;
4051			break;
4052		}
4053	}
4054
4055	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4056	tex.inst = opcode;
4057
4058	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4059	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4060	tex.src_gpr = src_gpr;
4061	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4062	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
4063	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
4064	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
4065	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
4066
4067	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
4068		tex.src_sel_x = 4;
4069		tex.src_sel_y = 4;
4070		tex.src_sel_z = 4;
4071		tex.src_sel_w = 4;
4072	} else if (src_loaded) {
4073		tex.src_sel_x = 0;
4074		tex.src_sel_y = 1;
4075		tex.src_sel_z = 2;
4076		tex.src_sel_w = 3;
4077	} else {
4078		tex.src_sel_x = ctx->src[0].swizzle[0];
4079		tex.src_sel_y = ctx->src[0].swizzle[1];
4080		tex.src_sel_z = ctx->src[0].swizzle[2];
4081		tex.src_sel_w = ctx->src[0].swizzle[3];
4082		tex.src_rel = ctx->src[0].rel;
4083	}
4084
4085	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE) {
4086		tex.src_sel_x = 1;
4087		tex.src_sel_y = 0;
4088		tex.src_sel_z = 3;
4089		tex.src_sel_w = 1;
4090	}
4091	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
4092		tex.src_sel_x = 1;
4093		tex.src_sel_y = 0;
4094		tex.src_sel_z = 3;
4095		tex.src_sel_w = 2; /* route Z compare value into W */
4096	}
4097
4098	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
4099	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
4100		tex.coord_type_x = 1;
4101		tex.coord_type_y = 1;
4102	}
4103	tex.coord_type_z = 1;
4104	tex.coord_type_w = 1;
4105
4106	tex.offset_x = offset_x;
4107	tex.offset_y = offset_y;
4108	tex.offset_z = offset_z;
4109
4110	/* Put the depth for comparison in W.
4111	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
4112	 * Some instructions expect the depth in Z. */
4113	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
4114	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
4115	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
4116	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
4117	    opcode != SQ_TEX_INST_SAMPLE_C_L &&
4118	    opcode != SQ_TEX_INST_SAMPLE_C_LB) {
4119		tex.src_sel_w = tex.src_sel_z;
4120	}
4121
4122	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
4123	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4124		if (opcode == SQ_TEX_INST_SAMPLE_C_L ||
4125		    opcode == SQ_TEX_INST_SAMPLE_C_LB) {
4126			/* the array index is read from Y */
4127			tex.coord_type_y = 0;
4128		} else {
4129			/* the array index is read from Z */
4130			tex.coord_type_z = 0;
4131			tex.src_sel_z = tex.src_sel_y;
4132		}
4133	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
4134		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
4135		/* the array index is read from Z */
4136		tex.coord_type_z = 0;
4137
4138	r = r600_bytecode_add_tex(ctx->bc, &tex);
4139	if (r)
4140		return r;
4141
4142	/* add shadow ambient support  - gallium doesn't do it yet */
4143	return 0;
4144}
4145
4146static int tgsi_lrp(struct r600_shader_ctx *ctx)
4147{
4148	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4149	struct r600_bytecode_alu alu;
4150	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4151	unsigned i;
4152	int r;
4153
4154	/* optimize if it's just an equal balance */
4155	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
4156		for (i = 0; i < lasti + 1; i++) {
4157			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4158				continue;
4159
4160			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4161			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4162			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4163			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4164			alu.omod = 3;
4165			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4166			alu.dst.chan = i;
4167			if (i == lasti) {
4168				alu.last = 1;
4169			}
4170			r = r600_bytecode_add_alu(ctx->bc, &alu);
4171			if (r)
4172				return r;
4173		}
4174		return 0;
4175	}
4176
4177	/* 1 - src0 */
4178	for (i = 0; i < lasti + 1; i++) {
4179		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4180			continue;
4181
4182		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4183		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
4184		alu.src[0].sel = V_SQ_ALU_SRC_1;
4185		alu.src[0].chan = 0;
4186		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4187		r600_bytecode_src_toggle_neg(&alu.src[1]);
4188		alu.dst.sel = ctx->temp_reg;
4189		alu.dst.chan = i;
4190		if (i == lasti) {
4191			alu.last = 1;
4192		}
4193		alu.dst.write = 1;
4194		r = r600_bytecode_add_alu(ctx->bc, &alu);
4195		if (r)
4196			return r;
4197	}
4198
4199	/* (1 - src0) * src2 */
4200	for (i = 0; i < lasti + 1; i++) {
4201		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4202			continue;
4203
4204		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4205		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4206		alu.src[0].sel = ctx->temp_reg;
4207		alu.src[0].chan = i;
4208		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4209		alu.dst.sel = ctx->temp_reg;
4210		alu.dst.chan = i;
4211		if (i == lasti) {
4212			alu.last = 1;
4213		}
4214		alu.dst.write = 1;
4215		r = r600_bytecode_add_alu(ctx->bc, &alu);
4216		if (r)
4217			return r;
4218	}
4219
4220	/* src0 * src1 + (1 - src0) * src2 */
4221	for (i = 0; i < lasti + 1; i++) {
4222		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4223			continue;
4224
4225		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4226		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4227		alu.is_op3 = 1;
4228		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4229		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4230		alu.src[2].sel = ctx->temp_reg;
4231		alu.src[2].chan = i;
4232
4233		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4234		alu.dst.chan = i;
4235		if (i == lasti) {
4236			alu.last = 1;
4237		}
4238		r = r600_bytecode_add_alu(ctx->bc, &alu);
4239		if (r)
4240			return r;
4241	}
4242	return 0;
4243}
4244
4245static int tgsi_cmp(struct r600_shader_ctx *ctx)
4246{
4247	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4248	struct r600_bytecode_alu alu;
4249	int i, r;
4250	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4251
4252	for (i = 0; i < lasti + 1; i++) {
4253		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4254			continue;
4255
4256		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4257		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE);
4258		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4259		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4260		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
4261		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4262		alu.dst.chan = i;
4263		alu.dst.write = 1;
4264		alu.is_op3 = 1;
4265		if (i == lasti)
4266			alu.last = 1;
4267		r = r600_bytecode_add_alu(ctx->bc, &alu);
4268		if (r)
4269			return r;
4270	}
4271	return 0;
4272}
4273
4274static int tgsi_xpd(struct r600_shader_ctx *ctx)
4275{
4276	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4277	static const unsigned int src0_swizzle[] = {2, 0, 1};
4278	static const unsigned int src1_swizzle[] = {1, 2, 0};
4279	struct r600_bytecode_alu alu;
4280	uint32_t use_temp = 0;
4281	int i, r;
4282
4283	if (inst->Dst[0].Register.WriteMask != 0xf)
4284		use_temp = 1;
4285
4286	for (i = 0; i < 4; i++) {
4287		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4288		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4289		if (i < 3) {
4290			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
4291			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
4292		} else {
4293			alu.src[0].sel = V_SQ_ALU_SRC_0;
4294			alu.src[0].chan = i;
4295			alu.src[1].sel = V_SQ_ALU_SRC_0;
4296			alu.src[1].chan = i;
4297		}
4298
4299		alu.dst.sel = ctx->temp_reg;
4300		alu.dst.chan = i;
4301		alu.dst.write = 1;
4302
4303		if (i == 3)
4304			alu.last = 1;
4305		r = r600_bytecode_add_alu(ctx->bc, &alu);
4306		if (r)
4307			return r;
4308	}
4309
4310	for (i = 0; i < 4; i++) {
4311		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4312		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
4313
4314		if (i < 3) {
4315			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
4316			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
4317		} else {
4318			alu.src[0].sel = V_SQ_ALU_SRC_0;
4319			alu.src[0].chan = i;
4320			alu.src[1].sel = V_SQ_ALU_SRC_0;
4321			alu.src[1].chan = i;
4322		}
4323
4324		alu.src[2].sel = ctx->temp_reg;
4325		alu.src[2].neg = 1;
4326		alu.src[2].chan = i;
4327
4328		if (use_temp)
4329			alu.dst.sel = ctx->temp_reg;
4330		else
4331			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4332		alu.dst.chan = i;
4333		alu.dst.write = 1;
4334		alu.is_op3 = 1;
4335		if (i == 3)
4336			alu.last = 1;
4337		r = r600_bytecode_add_alu(ctx->bc, &alu);
4338		if (r)
4339			return r;
4340	}
4341	if (use_temp)
4342		return tgsi_helper_copy(ctx, inst);
4343	return 0;
4344}
4345
4346static int tgsi_exp(struct r600_shader_ctx *ctx)
4347{
4348	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4349	struct r600_bytecode_alu alu;
4350	int r;
4351	int i;
4352
4353	/* result.x = 2^floor(src); */
4354	if (inst->Dst[0].Register.WriteMask & 1) {
4355		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4356
4357		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4358		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4359
4360		alu.dst.sel = ctx->temp_reg;
4361		alu.dst.chan = 0;
4362		alu.dst.write = 1;
4363		alu.last = 1;
4364		r = r600_bytecode_add_alu(ctx->bc, &alu);
4365		if (r)
4366			return r;
4367
4368		if (ctx->bc->chip_class == CAYMAN) {
4369			for (i = 0; i < 3; i++) {
4370				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4371				alu.src[0].sel = ctx->temp_reg;
4372				alu.src[0].chan = 0;
4373
4374				alu.dst.sel = ctx->temp_reg;
4375				alu.dst.chan = i;
4376				if (i == 0)
4377					alu.dst.write = 1;
4378				if (i == 2)
4379					alu.last = 1;
4380				r = r600_bytecode_add_alu(ctx->bc, &alu);
4381				if (r)
4382					return r;
4383			}
4384		} else {
4385			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4386			alu.src[0].sel = ctx->temp_reg;
4387			alu.src[0].chan = 0;
4388
4389			alu.dst.sel = ctx->temp_reg;
4390			alu.dst.chan = 0;
4391			alu.dst.write = 1;
4392			alu.last = 1;
4393			r = r600_bytecode_add_alu(ctx->bc, &alu);
4394			if (r)
4395				return r;
4396		}
4397	}
4398
4399	/* result.y = tmp - floor(tmp); */
4400	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4401		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4402
4403		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
4404		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4405
4406		alu.dst.sel = ctx->temp_reg;
4407#if 0
4408		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4409		if (r)
4410			return r;
4411#endif
4412		alu.dst.write = 1;
4413		alu.dst.chan = 1;
4414
4415		alu.last = 1;
4416
4417		r = r600_bytecode_add_alu(ctx->bc, &alu);
4418		if (r)
4419			return r;
4420	}
4421
4422	/* result.z = RoughApprox2ToX(tmp);*/
4423	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
4424		if (ctx->bc->chip_class == CAYMAN) {
4425			for (i = 0; i < 3; i++) {
4426				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4427				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4428				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4429
4430				alu.dst.sel = ctx->temp_reg;
4431				alu.dst.chan = i;
4432				if (i == 2) {
4433					alu.dst.write = 1;
4434					alu.last = 1;
4435				}
4436
4437				r = r600_bytecode_add_alu(ctx->bc, &alu);
4438				if (r)
4439					return r;
4440			}
4441		} else {
4442			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4443			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4444			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4445
4446			alu.dst.sel = ctx->temp_reg;
4447			alu.dst.write = 1;
4448			alu.dst.chan = 2;
4449
4450			alu.last = 1;
4451
4452			r = r600_bytecode_add_alu(ctx->bc, &alu);
4453			if (r)
4454				return r;
4455		}
4456	}
4457
4458	/* result.w = 1.0;*/
4459	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
4460		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4461
4462		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4463		alu.src[0].sel = V_SQ_ALU_SRC_1;
4464		alu.src[0].chan = 0;
4465
4466		alu.dst.sel = ctx->temp_reg;
4467		alu.dst.chan = 3;
4468		alu.dst.write = 1;
4469		alu.last = 1;
4470		r = r600_bytecode_add_alu(ctx->bc, &alu);
4471		if (r)
4472			return r;
4473	}
4474	return tgsi_helper_copy(ctx, inst);
4475}
4476
4477static int tgsi_log(struct r600_shader_ctx *ctx)
4478{
4479	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4480	struct r600_bytecode_alu alu;
4481	int r;
4482	int i;
4483
4484	/* result.x = floor(log2(|src|)); */
4485	if (inst->Dst[0].Register.WriteMask & 1) {
4486		if (ctx->bc->chip_class == CAYMAN) {
4487			for (i = 0; i < 3; i++) {
4488				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4489
4490				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4491				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4492				r600_bytecode_src_set_abs(&alu.src[0]);
4493
4494				alu.dst.sel = ctx->temp_reg;
4495				alu.dst.chan = i;
4496				if (i == 0)
4497					alu.dst.write = 1;
4498				if (i == 2)
4499					alu.last = 1;
4500				r = r600_bytecode_add_alu(ctx->bc, &alu);
4501				if (r)
4502					return r;
4503			}
4504
4505		} else {
4506			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4507
4508			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4509			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4510			r600_bytecode_src_set_abs(&alu.src[0]);
4511
4512			alu.dst.sel = ctx->temp_reg;
4513			alu.dst.chan = 0;
4514			alu.dst.write = 1;
4515			alu.last = 1;
4516			r = r600_bytecode_add_alu(ctx->bc, &alu);
4517			if (r)
4518				return r;
4519		}
4520
4521		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4522		alu.src[0].sel = ctx->temp_reg;
4523		alu.src[0].chan = 0;
4524
4525		alu.dst.sel = ctx->temp_reg;
4526		alu.dst.chan = 0;
4527		alu.dst.write = 1;
4528		alu.last = 1;
4529
4530		r = r600_bytecode_add_alu(ctx->bc, &alu);
4531		if (r)
4532			return r;
4533	}
4534
4535	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
4536	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
4537
4538		if (ctx->bc->chip_class == CAYMAN) {
4539			for (i = 0; i < 3; i++) {
4540				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4541
4542				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4543				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4544				r600_bytecode_src_set_abs(&alu.src[0]);
4545
4546				alu.dst.sel = ctx->temp_reg;
4547				alu.dst.chan = i;
4548				if (i == 1)
4549					alu.dst.write = 1;
4550				if (i == 2)
4551					alu.last = 1;
4552
4553				r = r600_bytecode_add_alu(ctx->bc, &alu);
4554				if (r)
4555					return r;
4556			}
4557		} else {
4558			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4559
4560			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4561			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4562			r600_bytecode_src_set_abs(&alu.src[0]);
4563
4564			alu.dst.sel = ctx->temp_reg;
4565			alu.dst.chan = 1;
4566			alu.dst.write = 1;
4567			alu.last = 1;
4568
4569			r = r600_bytecode_add_alu(ctx->bc, &alu);
4570			if (r)
4571				return r;
4572		}
4573
4574		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4575
4576		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
4577		alu.src[0].sel = ctx->temp_reg;
4578		alu.src[0].chan = 1;
4579
4580		alu.dst.sel = ctx->temp_reg;
4581		alu.dst.chan = 1;
4582		alu.dst.write = 1;
4583		alu.last = 1;
4584
4585		r = r600_bytecode_add_alu(ctx->bc, &alu);
4586		if (r)
4587			return r;
4588
4589		if (ctx->bc->chip_class == CAYMAN) {
4590			for (i = 0; i < 3; i++) {
4591				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4592				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4593				alu.src[0].sel = ctx->temp_reg;
4594				alu.src[0].chan = 1;
4595
4596				alu.dst.sel = ctx->temp_reg;
4597				alu.dst.chan = i;
4598				if (i == 1)
4599					alu.dst.write = 1;
4600				if (i == 2)
4601					alu.last = 1;
4602
4603				r = r600_bytecode_add_alu(ctx->bc, &alu);
4604				if (r)
4605					return r;
4606			}
4607		} else {
4608			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4609			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
4610			alu.src[0].sel = ctx->temp_reg;
4611			alu.src[0].chan = 1;
4612
4613			alu.dst.sel = ctx->temp_reg;
4614			alu.dst.chan = 1;
4615			alu.dst.write = 1;
4616			alu.last = 1;
4617
4618			r = r600_bytecode_add_alu(ctx->bc, &alu);
4619			if (r)
4620				return r;
4621		}
4622
4623		if (ctx->bc->chip_class == CAYMAN) {
4624			for (i = 0; i < 3; i++) {
4625				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4626				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4627				alu.src[0].sel = ctx->temp_reg;
4628				alu.src[0].chan = 1;
4629
4630				alu.dst.sel = ctx->temp_reg;
4631				alu.dst.chan = i;
4632				if (i == 1)
4633					alu.dst.write = 1;
4634				if (i == 2)
4635					alu.last = 1;
4636
4637				r = r600_bytecode_add_alu(ctx->bc, &alu);
4638				if (r)
4639					return r;
4640			}
4641		} else {
4642			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4643			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
4644			alu.src[0].sel = ctx->temp_reg;
4645			alu.src[0].chan = 1;
4646
4647			alu.dst.sel = ctx->temp_reg;
4648			alu.dst.chan = 1;
4649			alu.dst.write = 1;
4650			alu.last = 1;
4651
4652			r = r600_bytecode_add_alu(ctx->bc, &alu);
4653			if (r)
4654				return r;
4655		}
4656
4657		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4658
4659		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4660
4661		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4662		r600_bytecode_src_set_abs(&alu.src[0]);
4663
4664		alu.src[1].sel = ctx->temp_reg;
4665		alu.src[1].chan = 1;
4666
4667		alu.dst.sel = ctx->temp_reg;
4668		alu.dst.chan = 1;
4669		alu.dst.write = 1;
4670		alu.last = 1;
4671
4672		r = r600_bytecode_add_alu(ctx->bc, &alu);
4673		if (r)
4674			return r;
4675	}
4676
4677	/* result.z = log2(|src|);*/
4678	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
4679		if (ctx->bc->chip_class == CAYMAN) {
4680			for (i = 0; i < 3; i++) {
4681				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4682
4683				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4684				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4685				r600_bytecode_src_set_abs(&alu.src[0]);
4686
4687				alu.dst.sel = ctx->temp_reg;
4688				if (i == 2)
4689					alu.dst.write = 1;
4690				alu.dst.chan = i;
4691				if (i == 2)
4692					alu.last = 1;
4693
4694				r = r600_bytecode_add_alu(ctx->bc, &alu);
4695				if (r)
4696					return r;
4697			}
4698		} else {
4699			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4700
4701			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
4702			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4703			r600_bytecode_src_set_abs(&alu.src[0]);
4704
4705			alu.dst.sel = ctx->temp_reg;
4706			alu.dst.write = 1;
4707			alu.dst.chan = 2;
4708			alu.last = 1;
4709
4710			r = r600_bytecode_add_alu(ctx->bc, &alu);
4711			if (r)
4712				return r;
4713		}
4714	}
4715
4716	/* result.w = 1.0; */
4717	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
4718		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4719
4720		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
4721		alu.src[0].sel = V_SQ_ALU_SRC_1;
4722		alu.src[0].chan = 0;
4723
4724		alu.dst.sel = ctx->temp_reg;
4725		alu.dst.chan = 3;
4726		alu.dst.write = 1;
4727		alu.last = 1;
4728
4729		r = r600_bytecode_add_alu(ctx->bc, &alu);
4730		if (r)
4731			return r;
4732	}
4733
4734	return tgsi_helper_copy(ctx, inst);
4735}
4736
4737static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
4738{
4739	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4740	struct r600_bytecode_alu alu;
4741	int r;
4742
4743	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4744
4745	switch (inst->Instruction.Opcode) {
4746	case TGSI_OPCODE_ARL:
4747		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR;
4748		break;
4749	case TGSI_OPCODE_ARR:
4750		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4751		break;
4752	case TGSI_OPCODE_UARL:
4753		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4754		break;
4755	default:
4756		assert(0);
4757		return -1;
4758	}
4759
4760	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4761	alu.last = 1;
4762	alu.dst.sel = ctx->bc->ar_reg;
4763	alu.dst.write = 1;
4764	r = r600_bytecode_add_alu(ctx->bc, &alu);
4765	if (r)
4766		return r;
4767
4768	ctx->bc->ar_loaded = 0;
4769	return 0;
4770}
4771static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
4772{
4773	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4774	struct r600_bytecode_alu alu;
4775	int r;
4776
4777	switch (inst->Instruction.Opcode) {
4778	case TGSI_OPCODE_ARL:
4779		memset(&alu, 0, sizeof(alu));
4780		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR;
4781		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4782		alu.dst.sel = ctx->bc->ar_reg;
4783		alu.dst.write = 1;
4784		alu.last = 1;
4785
4786		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4787			return r;
4788
4789		memset(&alu, 0, sizeof(alu));
4790		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4791		alu.src[0].sel = ctx->bc->ar_reg;
4792		alu.dst.sel = ctx->bc->ar_reg;
4793		alu.dst.write = 1;
4794		alu.last = 1;
4795
4796		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4797			return r;
4798		break;
4799	case TGSI_OPCODE_ARR:
4800		memset(&alu, 0, sizeof(alu));
4801		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
4802		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4803		alu.dst.sel = ctx->bc->ar_reg;
4804		alu.dst.write = 1;
4805		alu.last = 1;
4806
4807		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4808			return r;
4809		break;
4810	case TGSI_OPCODE_UARL:
4811		memset(&alu, 0, sizeof(alu));
4812		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
4813		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4814		alu.dst.sel = ctx->bc->ar_reg;
4815		alu.dst.write = 1;
4816		alu.last = 1;
4817
4818		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4819			return r;
4820		break;
4821	default:
4822		assert(0);
4823		return -1;
4824	}
4825
4826	ctx->bc->ar_loaded = 0;
4827	return 0;
4828}
4829
4830static int tgsi_opdst(struct r600_shader_ctx *ctx)
4831{
4832	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4833	struct r600_bytecode_alu alu;
4834	int i, r = 0;
4835
4836	for (i = 0; i < 4; i++) {
4837		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4838
4839		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
4840		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4841
4842		if (i == 0 || i == 3) {
4843			alu.src[0].sel = V_SQ_ALU_SRC_1;
4844		} else {
4845			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4846		}
4847
4848		if (i == 0 || i == 2) {
4849			alu.src[1].sel = V_SQ_ALU_SRC_1;
4850		} else {
4851			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4852		}
4853		if (i == 3)
4854			alu.last = 1;
4855		r = r600_bytecode_add_alu(ctx->bc, &alu);
4856		if (r)
4857			return r;
4858	}
4859	return 0;
4860}
4861
4862static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
4863{
4864	struct r600_bytecode_alu alu;
4865	int r;
4866
4867	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4868	alu.inst = opcode;
4869	alu.execute_mask = 1;
4870	alu.update_pred = 1;
4871
4872	alu.dst.sel = ctx->temp_reg;
4873	alu.dst.write = 1;
4874	alu.dst.chan = 0;
4875
4876	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4877	alu.src[1].sel = V_SQ_ALU_SRC_0;
4878	alu.src[1].chan = 0;
4879
4880	alu.last = 1;
4881
4882	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
4883	if (r)
4884		return r;
4885	return 0;
4886}
4887
4888static int pops(struct r600_shader_ctx *ctx, int pops)
4889{
4890	unsigned force_pop = ctx->bc->force_add_cf;
4891
4892	if (!force_pop) {
4893		int alu_pop = 3;
4894		if (ctx->bc->cf_last) {
4895			if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU))
4896				alu_pop = 0;
4897			else if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER))
4898				alu_pop = 1;
4899		}
4900		alu_pop += pops;
4901		if (alu_pop == 1) {
4902			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER);
4903			ctx->bc->force_add_cf = 1;
4904		} else if (alu_pop == 2) {
4905			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER);
4906			ctx->bc->force_add_cf = 1;
4907		} else {
4908			force_pop = 1;
4909		}
4910	}
4911
4912	if (force_pop) {
4913		r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_POP));
4914		ctx->bc->cf_last->pop_count = pops;
4915		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
4916	}
4917
4918	return 0;
4919}
4920
4921static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
4922{
4923	switch(reason) {
4924	case FC_PUSH_VPM:
4925		ctx->bc->callstack[ctx->bc->call_sp].current--;
4926		break;
4927	case FC_PUSH_WQM:
4928	case FC_LOOP:
4929		ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
4930		break;
4931	case FC_REP:
4932		/* TOODO : for 16 vp asic should -= 2; */
4933		ctx->bc->callstack[ctx->bc->call_sp].current --;
4934		break;
4935	}
4936}
4937
4938static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
4939{
4940	if (check_max_only) {
4941		int diff;
4942		switch (reason) {
4943		case FC_PUSH_VPM:
4944			diff = 1;
4945			break;
4946		case FC_PUSH_WQM:
4947			diff = 4;
4948			break;
4949		default:
4950			assert(0);
4951			diff = 0;
4952		}
4953		if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
4954		    ctx->bc->callstack[ctx->bc->call_sp].max) {
4955			ctx->bc->callstack[ctx->bc->call_sp].max =
4956				ctx->bc->callstack[ctx->bc->call_sp].current + diff;
4957		}
4958		return;
4959	}
4960	switch (reason) {
4961	case FC_PUSH_VPM:
4962		ctx->bc->callstack[ctx->bc->call_sp].current++;
4963		break;
4964	case FC_PUSH_WQM:
4965	case FC_LOOP:
4966		ctx->bc->callstack[ctx->bc->call_sp].current += 4;
4967		break;
4968	case FC_REP:
4969		ctx->bc->callstack[ctx->bc->call_sp].current++;
4970		break;
4971	}
4972
4973	if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
4974	    ctx->bc->callstack[ctx->bc->call_sp].max) {
4975		ctx->bc->callstack[ctx->bc->call_sp].max =
4976			ctx->bc->callstack[ctx->bc->call_sp].current;
4977	}
4978}
4979
4980static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
4981{
4982	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
4983
4984	sp->mid = (struct r600_bytecode_cf **)realloc((void *)sp->mid,
4985						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
4986	sp->mid[sp->num_mid] = ctx->bc->cf_last;
4987	sp->num_mid++;
4988}
4989
4990static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
4991{
4992	ctx->bc->fc_sp++;
4993	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
4994	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
4995}
4996
4997static void fc_poplevel(struct r600_shader_ctx *ctx)
4998{
4999	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
5000	if (sp->mid) {
5001		free(sp->mid);
5002		sp->mid = NULL;
5003	}
5004	sp->num_mid = 0;
5005	sp->start = NULL;
5006	sp->type = 0;
5007	ctx->bc->fc_sp--;
5008}
5009
#if 0
/*
 * Disabled helpers for RETURN / JUMP / break-on-flag handling.  Nothing in
 * this block is compiled; several of the helpers are empty stubs.
 */

/* Add a RETURN CF instruction.  Always returns 0. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
	return 0;
}

/* Add a JUMP CF instruction with the given pop count; `offset` is unused. */
static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{
	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

/* Stub: does nothing and returns 0. */
static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

/* Stub: does nothing. */
static void emit_testflag(struct r600_shader_ctx *ctx)
{
}

/* Test the flag, jump, reset the flag, pop ifidx + 1 levels and return. */
static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

/*
 * Test the flag, add the current instruction's CF opcode with pop_count 1,
 * record it in the mid list of level fc_sp and pop one entry.
 */
static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
5057
5058static int tgsi_if(struct r600_shader_ctx *ctx)
5059{
5060	emit_logic_pred(ctx, CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
5061
5062	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
5063
5064	fc_pushlevel(ctx, FC_IF);
5065
5066	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
5067	return 0;
5068}
5069
5070static int tgsi_else(struct r600_shader_ctx *ctx)
5071{
5072	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_ELSE));
5073	ctx->bc->cf_last->pop_count = 1;
5074
5075	fc_set_mid(ctx, ctx->bc->fc_sp);
5076	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
5077	return 0;
5078}
5079
5080static int tgsi_endif(struct r600_shader_ctx *ctx)
5081{
5082	pops(ctx, 1);
5083	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
5084		R600_ERR("if/endif unbalanced in shader\n");
5085		return -1;
5086	}
5087
5088	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
5089		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5090		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
5091	} else {
5092		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
5093	}
5094	fc_poplevel(ctx);
5095
5096	callstack_decrease_current(ctx, FC_PUSH_VPM);
5097	return 0;
5098}
5099
5100static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
5101{
5102	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL));
5103
5104	fc_pushlevel(ctx, FC_LOOP);
5105
5106	/* check stack depth */
5107	callstack_check_depth(ctx, FC_LOOP, 0);
5108	return 0;
5109}
5110
5111static int tgsi_endloop(struct r600_shader_ctx *ctx)
5112{
5113	int i;
5114
5115	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END));
5116
5117	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
5118		R600_ERR("loop/endloop in shader code are not paired.\n");
5119		return -EINVAL;
5120	}
5121
5122	/* fixup loop pointers - from r600isa
5123	   LOOP END points to CF after LOOP START,
5124	   LOOP START point to CF after LOOP END
5125	   BRK/CONT point to LOOP END CF
5126	*/
5127	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
5128
5129	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
5130
5131	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
5132		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
5133	}
5134	/* XXX add LOOPRET support */
5135	fc_poplevel(ctx);
5136	callstack_decrease_current(ctx, FC_LOOP);
5137	return 0;
5138}
5139
5140static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
5141{
5142	unsigned int fscp;
5143
5144	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
5145	{
5146		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
5147			break;
5148	}
5149
5150	if (fscp == 0) {
5151		R600_ERR("Break not inside loop/endloop pair\n");
5152		return -EINVAL;
5153	}
5154
5155	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
5156
5157	fc_set_mid(ctx, fscp);
5158
5159	callstack_check_depth(ctx, FC_PUSH_VPM, 1);
5160	return 0;
5161}
5162
5163static int tgsi_umad(struct r600_shader_ctx *ctx)
5164{
5165	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5166	struct r600_bytecode_alu alu;
5167	int i, j, r;
5168	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5169
5170	/* src0 * src1 */
5171	for (i = 0; i < lasti + 1; i++) {
5172		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5173			continue;
5174
5175		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5176
5177		alu.dst.chan = i;
5178		alu.dst.sel = ctx->temp_reg;
5179		alu.dst.write = 1;
5180
5181		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
5182		for (j = 0; j < 2; j++) {
5183		        r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5184		}
5185
5186		alu.last = 1;
5187		r = r600_bytecode_add_alu(ctx->bc, &alu);
5188		if (r)
5189			return r;
5190	}
5191
5192
5193	for (i = 0; i < lasti + 1; i++) {
5194		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5195			continue;
5196
5197		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5198		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5199
5200		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
5201
5202		alu.src[0].sel = ctx->temp_reg;
5203		alu.src[0].chan = i;
5204
5205		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5206		if (i == lasti) {
5207			alu.last = 1;
5208		}
5209		r = r600_bytecode_add_alu(ctx->bc, &alu);
5210		if (r)
5211			return r;
5212	}
5213	return 0;
5214}
5215
5216static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
5217	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5218	{TGSI_OPCODE_MOV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5219	{TGSI_OPCODE_LIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5220
5221	/* XXX:
5222	 * For state trackers other than OpenGL, we'll want to use
5223	 * _RECIP_IEEE instead.
5224	 */
5225	{TGSI_OPCODE_RCP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
5226
5227	{TGSI_OPCODE_RSQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_rsq},
5228	{TGSI_OPCODE_EXP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5229	{TGSI_OPCODE_LOG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5230	{TGSI_OPCODE_MUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5231	{TGSI_OPCODE_ADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5232	{TGSI_OPCODE_DP3,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5233	{TGSI_OPCODE_DP4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5234	{TGSI_OPCODE_DST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5235	{TGSI_OPCODE_MIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5236	{TGSI_OPCODE_MAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5237	{TGSI_OPCODE_SLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5238	{TGSI_OPCODE_SGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5239	{TGSI_OPCODE_MAD,	1, V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5240	{TGSI_OPCODE_SUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5241	{TGSI_OPCODE_LRP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5242	{TGSI_OPCODE_CND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5243	/* gap */
5244	{20,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5245	{TGSI_OPCODE_DP2A,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5246	/* gap */
5247	{22,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5248	{23,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5249	{TGSI_OPCODE_FRC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5250	{TGSI_OPCODE_CLAMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5251	{TGSI_OPCODE_FLR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5252	{TGSI_OPCODE_ROUND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5253	{TGSI_OPCODE_EX2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5254	{TGSI_OPCODE_LG2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5255	{TGSI_OPCODE_POW,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5256	{TGSI_OPCODE_XPD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5257	/* gap */
5258	{32,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5259	{TGSI_OPCODE_ABS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5260	{TGSI_OPCODE_RCC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5261	{TGSI_OPCODE_DPH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5262	{TGSI_OPCODE_COS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5263	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5264	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5265	{TGSI_OPCODE_KILP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5266	{TGSI_OPCODE_PK2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5267	{TGSI_OPCODE_PK2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5268	{TGSI_OPCODE_PK4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5269	{TGSI_OPCODE_PK4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5270	{TGSI_OPCODE_RFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5271	{TGSI_OPCODE_SEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5272	{TGSI_OPCODE_SFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5273	{TGSI_OPCODE_SGT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5274	{TGSI_OPCODE_SIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5275	{TGSI_OPCODE_SLE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5276	{TGSI_OPCODE_SNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5277	{TGSI_OPCODE_STR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5278	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5279	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5280	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5281	{TGSI_OPCODE_UP2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5282	{TGSI_OPCODE_UP2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5283	{TGSI_OPCODE_UP4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5284	{TGSI_OPCODE_UP4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5285	{TGSI_OPCODE_X2D,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5286	{TGSI_OPCODE_ARA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5287	{TGSI_OPCODE_ARR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
5288	{TGSI_OPCODE_BRA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5289	{TGSI_OPCODE_CAL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5290	{TGSI_OPCODE_RET,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5291	{TGSI_OPCODE_SSG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5292	{TGSI_OPCODE_CMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5293	{TGSI_OPCODE_SCS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5294	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5295	{TGSI_OPCODE_NRM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5296	{TGSI_OPCODE_DIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5297	{TGSI_OPCODE_DP2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5298	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5299	{TGSI_OPCODE_BRK,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5300	{TGSI_OPCODE_IF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5301	/* gap */
5302	{75,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5303	{76,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5304	{TGSI_OPCODE_ELSE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5305	{TGSI_OPCODE_ENDIF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5306	/* gap */
5307	{79,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5308	{80,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5309	{TGSI_OPCODE_PUSHA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5310	{TGSI_OPCODE_POPA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5311	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5312	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5313	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5314	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5315	{TGSI_OPCODE_SHL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2_trans},
5316	/* gap */
5317	{88,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5318	{TGSI_OPCODE_AND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5319	{TGSI_OPCODE_OR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5320	{TGSI_OPCODE_MOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5321	{TGSI_OPCODE_XOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5322	{TGSI_OPCODE_SAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5323	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5324	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5325	{TGSI_OPCODE_CONT,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5326	{TGSI_OPCODE_EMIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5327	{TGSI_OPCODE_ENDPRIM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5328	{TGSI_OPCODE_BGNLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5329	{TGSI_OPCODE_BGNSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5330	{TGSI_OPCODE_ENDLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5331	{TGSI_OPCODE_ENDSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5332	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5333	/* gap */
5334	{104,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5335	{105,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5336	{106,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5337	{TGSI_OPCODE_NOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5338	/* gap */
5339	{108,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5340	{109,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5341	{110,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5342	{111,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5343	{TGSI_OPCODE_NRM4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5344	{TGSI_OPCODE_CALLNZ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5345	{TGSI_OPCODE_IFC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5346	{TGSI_OPCODE_BREAKC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5347	{TGSI_OPCODE_KIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5348	{TGSI_OPCODE_END,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5349	/* gap */
5350	{118,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5351	{TGSI_OPCODE_F2I,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2_trans},
5352	{TGSI_OPCODE_IDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5353	{TGSI_OPCODE_IMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5354	{TGSI_OPCODE_IMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5355	{TGSI_OPCODE_INEG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5356	{TGSI_OPCODE_ISGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5357	{TGSI_OPCODE_ISHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2_trans},
5358	{TGSI_OPCODE_ISLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5359	{TGSI_OPCODE_F2U,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
5360	{TGSI_OPCODE_U2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5361	{TGSI_OPCODE_UADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5362	{TGSI_OPCODE_UDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5363	{TGSI_OPCODE_UMAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5364	{TGSI_OPCODE_UMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5365	{TGSI_OPCODE_UMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5366	{TGSI_OPCODE_UMOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5367	{TGSI_OPCODE_UMUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5368	{TGSI_OPCODE_USEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5369	{TGSI_OPCODE_USGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5370	{TGSI_OPCODE_USHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2_trans},
5371	{TGSI_OPCODE_USLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5372	{TGSI_OPCODE_USNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2_swap},
5373	{TGSI_OPCODE_SWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5374	{TGSI_OPCODE_CASE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5375	{TGSI_OPCODE_DEFAULT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5376	{TGSI_OPCODE_ENDSWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5377	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5378	{TGSI_OPCODE_SAMPLE_I,  0, 0, tgsi_unsupported},
5379	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
5380	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5381	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5382	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5383	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5384	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5385	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5386	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5387	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5388	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5389	{TGSI_OPCODE_UARL,      0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_r600_arl},
5390	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5391	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5392	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5393	{TGSI_OPCODE_LAST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5394};
5395
5396static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
5397	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5398	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5399	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5400	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, tgsi_trans_srcx_replicate},
5401	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, tgsi_rsq},
5402	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5403	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5404	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5405	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5406	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5407	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5408	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5409	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5410	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5411	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5412	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5413	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5414	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5415	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5416	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5417	/* gap */
5418	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5419	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5420	/* gap */
5421	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5422	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5423	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5424	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5425	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5426	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5427	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
5428	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
5429	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
5430	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5431	/* gap */
5432	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5433	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5434	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5435	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5436	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
5437	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5438	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5439	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5440	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5441	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5442	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5443	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5444	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5445	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5446	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5447	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5448	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
5449	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5450	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5451	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5452	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5453	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5454	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5455	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5456	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5457	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5458	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5459	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5460	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5461	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5462	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5463	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5464	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5465	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5466	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5467	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5468	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5469	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5470	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5471	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5472	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5473	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5474	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5475	/* gap */
5476	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5477	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5478	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5479	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5480	/* gap */
5481	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5482	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5483	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5484	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5485	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5486	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
5487	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5488	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5489	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
5490	/* gap */
5491	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5492	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5493	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5494	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5495	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5496	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5497	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5498	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5499	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5500	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5501	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5502	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5503	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5504	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5505	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5506	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5507	/* gap */
5508	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5509	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5510	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5511	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5512	/* gap */
5513	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5514	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5515	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5516	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5517	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5518	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5519	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5520	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5521	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5522	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5523	/* gap */
5524	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5525	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_f2i},
5526	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5527	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5528	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5529	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5530	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5531	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
5532	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5533	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_f2i},
5534	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
5535	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5536	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5537	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5538	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5539	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5540	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5541	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
5542	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5543	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5544	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
5545	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5546	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
5547	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5548	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5549	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5550	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5551	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5552	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
5553	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
5554	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5555	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5556	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5557	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5558	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5559	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5560	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5561	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5562	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5563	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
5564	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5565	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5566	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5567	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5568};
5569
5570static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
5571	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5572	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5573	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
5574	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, cayman_emit_float_instr},
5575	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, cayman_emit_float_instr},
5576	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
5577	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
5578	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
5579	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5580	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5581	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5582	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
5583	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
5584	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
5585	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
5586	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
5587	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
5588	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
5589	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
5590	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5591	/* gap */
5592	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5593	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5594	/* gap */
5595	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5596	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5597	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
5598	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5599	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
5600	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
5601	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, cayman_emit_float_instr},
5602	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, cayman_emit_float_instr},
5603	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, cayman_pow},
5604	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
5605	/* gap */
5606	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5607	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
5608	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5609	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5610	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, cayman_trig},
5611	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
5612	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
5613	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
5614	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5615	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5616	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5617	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5618	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5619	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
5620	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5621	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
5622	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, cayman_trig},
5623	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
5624	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
5625	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5626	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5627	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
5628	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
5629	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5630	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5631	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5632	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5633	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5634	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5635	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
5636	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5637	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5638	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5639	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
5640	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
5641	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
5642	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
5643	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5644	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5645	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
5646	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
5647	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
5648	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
5649	/* gap */
5650	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5651	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5652	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
5653	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
5654	/* gap */
5655	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5656	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5657	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5658	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5659	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
5660	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2},
5661	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
5662	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
5663	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
5664	/* gap */
5665	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5666	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
5667	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
5668	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
5669	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
5670	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5671	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
5672	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5673	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
5674	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5675	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5676	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
5677	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5678	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
5679	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5680	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
5681	/* gap */
5682	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5683	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5684	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5685	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5686	/* gap */
5687	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5688	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5689	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5690	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5691	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5692	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5693	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5694	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5695	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
5696	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
5697	/* gap */
5698	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5699	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2},
5700	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
5701	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
5702	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
5703	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
5704	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
5705	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
5706	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
5707	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
5708	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2},
5709	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
5710	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
5711	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
5712	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
5713	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
5714	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
5715	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT, cayman_mul_int_instr},
5716	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
5717	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
5718	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
5719	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
5720	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
5721	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5722	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5723	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5724	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5725	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
5726	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
5727	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
5728	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
5729	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
5730	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
5731	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
5732	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
5733	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
5734	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
5735	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
5736	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
5737	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
5738	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
5739	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
5740	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
5741	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
5742};
5743