r600_shader.c revision f9caabe8f1bff86d19b53d9ecba5c72b238d9e23
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600_shader.h"
28#include "r600d.h"
29
30#include "sb/sb_public.h"
31
32#include "pipe/p_shader_tokens.h"
33#include "tgsi/tgsi_info.h"
34#include "tgsi/tgsi_parse.h"
35#include "tgsi/tgsi_scan.h"
36#include "tgsi/tgsi_dump.h"
37#include "util/u_memory.h"
38#include "util/u_math.h"
39#include <stdio.h>
40#include <errno.h>
41
42/* CAYMAN notes
43Why CAYMAN got loops for lots of instructions is explained here.
44
45-These 8xx t-slot only ops are implemented in all vector slots.
46MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47These 8xx t-slot only opcodes become vector ops, with all four
48slots expecting the arguments on sources a and b. Result is
49broadcast to all channels.
50MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51These 8xx t-slot only opcodes become vector ops in the z, y, and
52x slots.
53EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55SQRT_IEEE/_64
56SIN/COS
57The w slot may have an independent co-issued operation, or if the
58result is required to be in the w slot, the opcode above may be
59issued in the w slot as well.
60The compiler must issue the source argument to slots z, y, and x
61*/
62
63#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
64static int r600_shader_from_tgsi(struct r600_context *rctx,
65				 struct r600_pipe_shader *pipeshader,
66				 union r600_shader_key key);
67
68
69static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
70                           int size, unsigned comp_mask) {
71
72	if (!size)
73		return;
74
75	if (ps->num_arrays == ps->max_arrays) {
76		ps->max_arrays += 64;
77		ps->arrays = realloc(ps->arrays, ps->max_arrays *
78		                     sizeof(struct r600_shader_array));
79	}
80
81	int n = ps->num_arrays;
82	++ps->num_arrays;
83
84	ps->arrays[n].comp_mask = comp_mask;
85	ps->arrays[n].gpr_start = start_gpr;
86	ps->arrays[n].gpr_count = size;
87}
88
89static void r600_dump_streamout(struct pipe_stream_output_info *so)
90{
91	unsigned i;
92
93	fprintf(stderr, "STREAMOUT\n");
94	for (i = 0; i < so->num_outputs; i++) {
95		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
96				so->output[i].start_component;
97		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
98			i,
99			so->output[i].stream,
100			so->output[i].output_buffer,
101			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
102			so->output[i].register_index,
103			mask & 1 ? "x" : "",
104		        mask & 2 ? "y" : "",
105		        mask & 4 ? "z" : "",
106		        mask & 8 ? "w" : "",
107			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
108	}
109}
110
111static int store_shader(struct pipe_context *ctx,
112			struct r600_pipe_shader *shader)
113{
114	struct r600_context *rctx = (struct r600_context *)ctx;
115	uint32_t *ptr, i;
116
117	if (shader->bo == NULL) {
118		shader->bo = (struct r600_resource*)
119			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
120		if (shader->bo == NULL) {
121			return -ENOMEM;
122		}
123		ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
124		if (R600_BIG_ENDIAN) {
125			for (i = 0; i < shader->shader.bc.ndw; ++i) {
126				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
127			}
128		} else {
129			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
130		}
131		rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
132	}
133
134	return 0;
135}
136
/*
 * Build a complete pipe shader: translate the selector's TGSI tokens into
 * r600 bytecode (r600_shader_from_tgsi), optionally run the SB optimizing
 * backend, upload the bytecode to a buffer object (store_shader) and emit
 * the stage-specific hardware state.
 * Returns 0 on success or a negative errno; on any failure the partially
 * built shader is torn down via r600_pipe_shader_destroy.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
	/* SB backend is on unless explicitly disabled by a debug flag */
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}

    /* disable SB for geom shaders on R6xx/R7xx due to some mysterious gs piglit regressions with it enabled. */
    if (rctx->b.chip_class <= R700) {
	    use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY);
    }
	/* disable SB for shaders using CF_INDEX_0/1 (sampler/ubo array indexing) as it doesn't handle those currently */
	use_sb &= !shader->shader.uses_index_registers;
	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	/* Check if the bytecode has already been built.  When using the llvm
	 * backend, r600_shader_from_tgsi() will take care of building the
	 * bytecode.
	 */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	/* either plain disassembly, or SB processing (which can also dump) */
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
		                             dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	/* a GS carries an auxiliary copy shader; store it too */
	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case TGSI_PROCESSOR_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case TGSI_PROCESSOR_VERTEX:
		/* key.vs.as_es: the VS is compiled as an ES (export shader) */
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (export_shader)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
258
/* Release everything a pipe shader owns: the GPU buffer object, the
 * bytecode storage and the command buffer.  Also used as the error path
 * of r600_pipe_shader_create, so it must tolerate partially built shaders. */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
265
266/*
267 * tgsi -> r600 shader
268 */
269struct r600_shader_tgsi_instruction;
270
/* A TGSI source operand decoded into r600 terms (filled in by tgsi_src). */
struct r600_shader_src {
	unsigned				sel;       /* register / special-value select */
	unsigned				swizzle[4]; /* per-channel swizzle */
	unsigned				neg;       /* negate modifier */
	unsigned				abs;       /* absolute-value modifier */
	unsigned				rel;       /* relative (indirect) addressing */
	unsigned				kc_bank;   /* constant cache bank -- presumably the const buffer index, confirm */
	boolean					kc_rel; /* true if cache bank is indexed */
	uint32_t				value[4];  /* literal values when the operand is an immediate */
};
281
/* State of one evergreen barycentric interpolator
 * (array slot chosen by eg_get_interpolator_index). */
struct eg_interp {
	boolean					enabled;  /* referenced by an input or interp instruction */
	unsigned				ij_index; /* i/j slot assigned in evergreen_gpr_count */
};
286
/* Per-compilation state for the TGSI -> r600 bytecode translation. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;   /* TGSI_PROCESSOR_* being compiled */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* first GPR of each TGSI register file */
	unsigned				temp_reg; /* base GPR for driver temps, see r600_get_temp */
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4]; /* decoded sources of the current instruction */
	uint32_t				*literals; /* immediate pool, 4 dwords per TGSI immediate */
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;
	boolean use_llvm;
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean                 clip_vertex_write;
	unsigned                cv_output;   /* output slot of TGSI_SEMANTIC_CLIPVERTEX */
	unsigned		edgeflag_output; /* output slot of TGSI_SEMANTIC_EDGEFLAG */
	int					fragcoord_input; /* FS input slot of TGSI_SEMANTIC_POSITION */
	int					native_integers;
	int					next_ring_offset; /* GS: ring offset for the next declared input */
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
};
322
/* One entry of the per-chip opcode tables: maps a TGSI opcode to the
 * r600 opcode it usually lowers to and the callback that emits it. */
struct r600_shader_tgsi_instruction {
	unsigned	op;	/* target r600 opcode (interpretation is up to process) */
	int (*process)(struct r600_shader_ctx *ctx); /* emits bytecode for the current instruction */
};
327
328static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
329static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
330static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
331static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
332static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
333static int tgsi_else(struct r600_shader_ctx *ctx);
334static int tgsi_endif(struct r600_shader_ctx *ctx);
335static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
336static int tgsi_endloop(struct r600_shader_ctx *ctx);
337static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
338static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
339                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
340                                unsigned int dst_reg);
341static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
342			const struct r600_shader_src *shader_src,
343			unsigned chan);
344
345static int tgsi_is_supported(struct r600_shader_ctx *ctx)
346{
347	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
348	int j;
349
350	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
351		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
352		return -EINVAL;
353	}
354	if (i->Instruction.Predicate) {
355		R600_ERR("predicate unsupported\n");
356		return -EINVAL;
357	}
358#if 0
359	if (i->Instruction.Label) {
360		R600_ERR("label unsupported\n");
361		return -EINVAL;
362	}
363#endif
364	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
365		if (i->Src[j].Register.Dimension) {
366		   switch (i->Src[j].Register.File) {
367		   case TGSI_FILE_CONSTANT:
368			   break;
369		   case TGSI_FILE_INPUT:
370			   if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
371				   break;
372		   default:
373			   R600_ERR("unsupported src %d (dimension %d)\n", j,
374				    i->Src[j].Register.Dimension);
375			   return -EINVAL;
376		   }
377		}
378	}
379	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
380		if (i->Dst[j].Register.Dimension) {
381			R600_ERR("unsupported dst (dimension)\n");
382			return -EINVAL;
383		}
384	}
385	return 0;
386}
387
388int eg_get_interpolator_index(unsigned interpolate, unsigned location)
389{
390	if (interpolate == TGSI_INTERPOLATE_COLOR ||
391		interpolate == TGSI_INTERPOLATE_LINEAR ||
392		interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
393	{
394		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
395		int loc;
396
397		switch(location) {
398		case TGSI_INTERPOLATE_LOC_CENTER:
399			loc = 1;
400			break;
401		case TGSI_INTERPOLATE_LOC_CENTROID:
402			loc = 2;
403			break;
404		case TGSI_INTERPOLATE_LOC_SAMPLE:
405		default:
406			loc = 0; break;
407		}
408
409		return is_linear * 3 + loc;
410	}
411
412	return -1;
413}
414
/* Copy the ij_index that evergreen_gpr_count assigned to this input's
 * interpolator into the shader input record.  The interpolator must have
 * a valid slot (asserted). */
static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
		int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}
424
/* Emit the barycentric interpolation ALU ops for one FS input on EG+:
 * four INTERP_ZW ops followed by four INTERP_XY ops.  Only the ops whose
 * result channel belongs to the destination (i = 2..5, i.e. zw of the
 * first group and xy of the second) actually write. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	/* two i/j pairs per GPR: pair 0 in .xy, pair 1 in .zw */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* only channels z,w (i=2,3) and x,y (i=4,5) are written */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* src0 alternates between the two channels of the i/j pair */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;	/* close each group of four */
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
465
466static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
467{
468	int i, r;
469	struct r600_bytecode_alu alu;
470
471	for (i = 0; i < 4; i++) {
472		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
473
474		alu.op = ALU_OP1_INTERP_LOAD_P0;
475
476		alu.dst.sel = ctx->shader->input[input].gpr;
477		alu.dst.write = 1;
478
479		alu.dst.chan = i;
480
481		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
482		alu.src[0].chan = i;
483
484		if (i == 3)
485			alu.last = 1;
486		r = r600_bytecode_add_alu(ctx->bc, &alu);
487		if (r)
488			return r;
489	}
490	return 0;
491}
492
493/*
494 * Special export handling in shaders
495 *
496 * shader export ARRAY_BASE for EXPORT_POS:
497 * 60 is position
498 * 61 is misc vector
499 * 62, 63 are clip distance vectors
500 *
501 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
502 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
503 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
504 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
505 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
506 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
507 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
508 * exclusive from render target index)
509 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
510 *
511 *
512 * shader export ARRAY_BASE for EXPORT_PIXEL:
513 * 0-7 CB targets
514 * 61 computed Z vector
515 *
516 * The use of the values exported in the computed Z vector are controlled
517 * by DB_SHADER_CONTROL:
518 * Z_EXPORT_ENABLE - Z as a float in RED
519 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
520 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
521 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
522 * DB_SOURCE_FORMAT - export control restrictions
523 *
524 */
525
526
527/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
528static int r600_spi_sid(struct r600_shader_io * io)
529{
530	int index, name = io->name;
531
532	/* These params are handled differently, they don't need
533	 * semantic indices, so we'll use 0 for them.
534	 */
535	if (name == TGSI_SEMANTIC_POSITION ||
536	    name == TGSI_SEMANTIC_PSIZE ||
537	    name == TGSI_SEMANTIC_EDGEFLAG ||
538	    name == TGSI_SEMANTIC_FACE ||
539	    name == TGSI_SEMANTIC_SAMPLEMASK)
540		index = 0;
541	else {
542		if (name == TGSI_SEMANTIC_GENERIC) {
543			/* For generic params simply use sid from tgsi */
544			index = io->sid;
545		} else {
546			/* For non-generic params - pack name and sid into 8 bits */
547			index = 0x80 | (name<<3) | (io->sid);
548		}
549
550		/* Make sure that all really used indices have nonzero value, so
551		 * we can just compare it to 0 later instead of comparing the name
552		 * with different values to detect special cases. */
553		index++;
554	}
555
556	return index;
557};
558
559/* turn input into interpolate on EG */
560static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
561{
562	int r = 0;
563
564	if (ctx->shader->input[index].spi_sid) {
565		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
566		if (ctx->shader->input[index].interpolate > 0) {
567			evergreen_interp_assign_ij_index(ctx, index);
568			if (!ctx->use_llvm)
569				r = evergreen_interp_alu(ctx, index);
570		} else {
571			if (!ctx->use_llvm)
572				r = evergreen_interp_flat(ctx, index);
573		}
574	}
575	return r;
576}
577
578static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
579{
580	struct r600_bytecode_alu alu;
581	int i, r;
582	int gpr_front = ctx->shader->input[front].gpr;
583	int gpr_back = ctx->shader->input[back].gpr;
584
585	for (i = 0; i < 4; i++) {
586		memset(&alu, 0, sizeof(alu));
587		alu.op = ALU_OP3_CNDGT;
588		alu.is_op3 = 1;
589		alu.dst.write = 1;
590		alu.dst.sel = gpr_front;
591		alu.src[0].sel = ctx->face_gpr;
592		alu.src[1].sel = gpr_front;
593		alu.src[2].sel = gpr_back;
594
595		alu.dst.chan = i;
596		alu.src[1].chan = i;
597		alu.src[2].chan = i;
598		alu.last = (i==3);
599
600		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
601			return r;
602	}
603
604	return 0;
605}
606
607static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
608{
609	int i;
610	i = ctx->shader->noutput++;
611	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
612	ctx->shader->output[i].sid = 0;
613	ctx->shader->output[i].gpr = 0;
614	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
615	ctx->shader->output[i].write_mask = 0x4;
616	ctx->shader->output[i].spi_sid = prim_id_sid;
617
618	return 0;
619}
620
/* Process one TGSI declaration token: record inputs/outputs with their
 * semantics, SPI ids and GPR assignment, register indirectly addressed
 * temporary arrays, and handle system value declarations.  Returns 0 or
 * -EINVAL for unsupported register files / system values. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < Elements(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				/* FS inputs need explicit interpolation code on EG+ */
				if (ctx->bc->chip_class >= EVERGREEN) {
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < Elements(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == TGSI_PROCESSOR_VERTEX ||
			    ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				/* track which outputs feed the misc/clip-dist exports */
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					ctx->shader->clip_dist_write |= d->Declaration.UsageMask <<
									((d->Semantic.Index + j) << 2);
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		/* indirectly addressed temps become a GPR array */
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
				               ctx->file_offset[TGSI_FILE_TEMPORARY] +
								   d->Range.First,
				               d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			/* without native integers, convert the raw instance id
			 * in GPR0.w to float in place */
			if (!ctx->native_integers) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		/* fall through: any other system value is unsupported */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
773
774static int r600_get_temp(struct r600_shader_ctx *ctx)
775{
776	return ctx->temp_reg + ctx->max_driver_temp_used++;
777}
778
/* Scan the TGSI tokens for system values (sample mask / sample id /
 * sample pos) and for interpolateAt* instructions, then reserve input
 * GPRs for the system values that are needed, starting at gpr_offset.
 * Also marks the interpolators those instructions require.
 * Returns the first free GPR after the allocations. */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;                 /* system value is needed */
		int *reg;                        /* where the allocated GPR is stored */
		unsigned name, alternate_name;   /* TGSI semantics that trigger it */
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int i, k, num_regs = 0;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					inputs[1].enabled = true; /* needs SAMPLEID */
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				/* mark the interpolator used by this instruction */
				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < Elements(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* allocate one input GPR per enabled system value */
	for (i = 0; i < Elements(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput ++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
858
859/*
860 * for evergreen we need to scan the shader to find the number of GPRs we need to
861 * reserve for interpolation and system values
862 *
863 * we need to know if we are going to emit
864 * any sample or centroid inputs
865 * if perspective and linear are required
866*/
/* Count and assign the GPRs reserved at the start of the FS register file
 * for barycentric interpolators, then chain to
 * allocate_system_value_inputs for the system value GPRs.
 * Returns the total number of reserved leading GPRs. */
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	int i;
	int num_baryc;
	struct tgsi_parse_context parse;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/* mark the interpolators needed by the declared inputs */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		}
	}

	tgsi_parse_free(&parse);

	/* assign gpr to each interpolator according to priority */
	num_baryc = 0;
	for (i = 0; i < Elements(ctx->eg_interpolators); i++) {
		if (ctx->eg_interpolators[i].enabled) {
			ctx->eg_interpolators[i].ij_index = num_baryc;
			num_baryc ++;
		}
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	/* two i/j pairs fit in one GPR (see evergreen_interp_alu), round up */
	num_baryc = (num_baryc + 1) >> 1;
	return allocate_system_value_inputs(ctx, num_baryc);
}
938
939/* sample_id_sel == NULL means fetch for current sample */
/* Fetch a sample position (xyzw float) from the R600_BUFFER_INFO constant
 * buffer via a vertex fetch.  With sample_id == NULL the current sample id
 * (.w of the fixed point position GPR) indexes the fetch; otherwise the
 * chan_sel channel of sample_id is moved into a temp and used instead.
 * Returns the temp GPR holding the result, or a negative error code. */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	assert(ctx->fixed_pt_position_gpr != -1);

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		/* copy the requested sample id channel into t1.x to use
		 * as the fetch index */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 1; // first element is size of buffer
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}
993
994static void tgsi_src(struct r600_shader_ctx *ctx,
995		     const struct tgsi_full_src_register *tgsi_src,
996		     struct r600_shader_src *r600_src)
997{
998	memset(r600_src, 0, sizeof(*r600_src));
999	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
1000	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
1001	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
1002	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
1003	r600_src->neg = tgsi_src->Register.Negate;
1004	r600_src->abs = tgsi_src->Register.Absolute;
1005
1006	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
1007		int index;
1008		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
1009			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
1010			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
1011
1012			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
1013			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
1014			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
1015				return;
1016		}
1017		index = tgsi_src->Register.Index;
1018		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
1019		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
1020	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
1021		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
1022			r600_src->swizzle[0] = 2; // Z value
1023			r600_src->swizzle[1] = 2;
1024			r600_src->swizzle[2] = 2;
1025			r600_src->swizzle[3] = 2;
1026			r600_src->sel = ctx->face_gpr;
1027		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
1028			r600_src->swizzle[0] = 3; // W value
1029			r600_src->swizzle[1] = 3;
1030			r600_src->swizzle[2] = 3;
1031			r600_src->swizzle[3] = 3;
1032			r600_src->sel = ctx->fixed_pt_position_gpr;
1033		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
1034			r600_src->swizzle[0] = 0;
1035			r600_src->swizzle[1] = 1;
1036			r600_src->swizzle[2] = 4;
1037			r600_src->swizzle[3] = 4;
1038			r600_src->sel = load_sample_position(ctx, NULL, -1);
1039		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
1040			r600_src->swizzle[0] = 3;
1041			r600_src->swizzle[1] = 3;
1042			r600_src->swizzle[2] = 3;
1043			r600_src->swizzle[3] = 3;
1044			r600_src->sel = 0;
1045		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
1046			r600_src->swizzle[0] = 0;
1047			r600_src->swizzle[1] = 0;
1048			r600_src->swizzle[2] = 0;
1049			r600_src->swizzle[3] = 0;
1050			r600_src->sel = 0;
1051		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1052			r600_src->swizzle[0] = 3;
1053			r600_src->swizzle[1] = 3;
1054			r600_src->swizzle[2] = 3;
1055			r600_src->swizzle[3] = 3;
1056			r600_src->sel = 1;
1057		}
1058	} else {
1059		if (tgsi_src->Register.Indirect)
1060			r600_src->rel = V_SQ_REL_RELATIVE;
1061		r600_src->sel = tgsi_src->Register.Index;
1062		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1063	}
1064	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1065		if (tgsi_src->Register.Dimension) {
1066			r600_src->kc_bank = tgsi_src->Dimension.Index;
1067			if (tgsi_src->Dimension.Indirect) {
1068				r600_src->kc_rel = 1;
1069			}
1070		}
1071	}
1072}
1073
/* Fetch a relatively-addressed constant from constant buffer cb_idx into
 * dst_reg.xyzw using a vertex fetch.
 *
 * The fetch address is taken from channel ar_chan of the AR register,
 * optionally biased by 'offset'.  When biased, the sum is computed into
 * dst_reg.ar_chan first -- safe to clobber, since the fetch overwrites
 * dst_reg entirely.  cb_rel enables index-register-relative selection of
 * the constant buffer itself.
 *
 * Returns 0 on success or the r600_bytecode_add_* error code.
 */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
                                unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		/* dst_reg.ar_chan = AR.ar_chan + offset */
		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1129
1130static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1131{
1132	struct r600_bytecode_vtx vtx;
1133	int r;
1134	unsigned index = src->Register.Index;
1135	unsigned vtx_id = src->Dimension.Index;
1136	int offset_reg = vtx_id / 3;
1137	int offset_chan = vtx_id % 3;
1138
1139	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1140	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1141
1142	if (offset_reg == 0 && offset_chan == 2)
1143		offset_chan = 3;
1144
1145	if (src->Dimension.Indirect) {
1146		int treg[3];
1147		int t2;
1148		struct r600_bytecode_alu alu;
1149		int r, i;
1150
1151		/* you have got to be shitting me -
1152		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
1153		   at least this is what fglrx seems to do. */
1154		for (i = 0; i < 3; i++) {
1155			treg[i] = r600_get_temp(ctx);
1156		}
1157		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1158
1159		t2 = r600_get_temp(ctx);
1160		for (i = 0; i < 3; i++) {
1161			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1162			alu.op = ALU_OP1_MOV;
1163			alu.src[0].sel = 0;
1164			alu.src[0].chan = i == 2 ? 3 : i;
1165			alu.dst.sel = treg[i];
1166			alu.dst.chan = 0;
1167			alu.dst.write = 1;
1168			alu.last = 1;
1169			r = r600_bytecode_add_alu(ctx->bc, &alu);
1170			if (r)
1171				return r;
1172		}
1173		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1174		alu.op = ALU_OP1_MOV;
1175		alu.src[0].sel = treg[0];
1176		alu.src[0].rel = 1;
1177		alu.dst.sel = t2;
1178		alu.dst.write = 1;
1179		alu.last = 1;
1180		r = r600_bytecode_add_alu(ctx->bc, &alu);
1181		if (r)
1182			return r;
1183		offset_reg = t2;
1184	}
1185
1186
1187	memset(&vtx, 0, sizeof(vtx));
1188	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1189	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1190	vtx.src_gpr = offset_reg;
1191	vtx.src_sel_x = offset_chan;
1192	vtx.offset = index * 16; /*bytes*/
1193	vtx.mega_fetch_count = 16;
1194	vtx.dst_gpr = dst_reg;
1195	vtx.dst_sel_x = 0;		/* SEL_X */
1196	vtx.dst_sel_y = 1;		/* SEL_Y */
1197	vtx.dst_sel_z = 2;		/* SEL_Z */
1198	vtx.dst_sel_w = 3;		/* SEL_W */
1199	if (ctx->bc->chip_class >= EVERGREEN) {
1200		vtx.use_const_fields = 1;
1201	} else {
1202		vtx.data_format = FMT_32_32_32_32_FLOAT;
1203	}
1204
1205	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1206		return r;
1207
1208	return 0;
1209}
1210
1211static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1212{
1213	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1214	int i;
1215
1216	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1217		struct tgsi_full_src_register *src = &inst->Src[i];
1218
1219		if (src->Register.File == TGSI_FILE_INPUT) {
1220			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1221				/* primitive id is in R0.z */
1222				ctx->src[i].sel = 0;
1223				ctx->src[i].swizzle[0] = 2;
1224			}
1225		}
1226		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1227			int treg = r600_get_temp(ctx);
1228
1229			fetch_gs_input(ctx, src, treg);
1230			ctx->src[i].sel = treg;
1231		}
1232	}
1233	return 0;
1234}
1235
/* Lower constant-file operands of the current instruction.
 *
 * All TGSI sources are first translated with tgsi_src().  Then every
 * relatively-addressed constant is replaced by a vertex fetch into a temp
 * GPR, and all but one of the remaining constant operands are copied into
 * temp GPRs.  'j' counts the constants still needing relocation, so the
 * last plain constant may stay in the constant file -- presumably a
 * hardware limit on distinct constant operands per ALU instruction
 * (TODO confirm against the ISA docs).
 *
 * Returns 0 on success or a bytecode emission error code.
 */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* translate all sources and count the constant-file ones */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].kc_rel)
			ctx->shader->uses_index_registers = true;

		if (ctx->src[i].rel) {
			/* relatively-addressed constant: fetch via the AR
			 * register; sel - 512 is the offset within the buffer */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			/* operand now reads the temp GPR directly */
			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* copy the constant into a temp GPR, one MOV per channel */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
1293
1294/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1295static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1296{
1297	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1298	struct r600_bytecode_alu alu;
1299	int i, j, k, nliteral, r;
1300
1301	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1302		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1303			nliteral++;
1304		}
1305	}
1306	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1307		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1308			int treg = r600_get_temp(ctx);
1309			for (k = 0; k < 4; k++) {
1310				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1311				alu.op = ALU_OP1_MOV;
1312				alu.src[0].sel = ctx->src[i].sel;
1313				alu.src[0].chan = k;
1314				alu.src[0].value = ctx->src[i].value[k];
1315				alu.dst.sel = treg;
1316				alu.dst.chan = k;
1317				alu.dst.write = 1;
1318				if (k == 3)
1319					alu.last = 1;
1320				r = r600_bytecode_add_alu(ctx->bc, &alu);
1321				if (r)
1322					return r;
1323			}
1324			ctx->src[i].sel = treg;
1325			j--;
1326		}
1327	}
1328	return 0;
1329}
1330
1331static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1332{
1333	int i, r, count = ctx->shader->ninput;
1334
1335	for (i = 0; i < count; i++) {
1336		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1337			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
1338			if (r)
1339				return r;
1340		}
1341	}
1342	return 0;
1343}
1344
1345static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
1346						  int stream, unsigned *stream_item_size)
1347{
1348	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
1349	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
1350	int i, j, r;
1351
1352	/* Sanity checking. */
1353	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
1354		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
1355		r = -EINVAL;
1356		goto out_err;
1357	}
1358	for (i = 0; i < so->num_outputs; i++) {
1359		if (so->output[i].output_buffer >= 4) {
1360			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
1361				 so->output[i].output_buffer);
1362			r = -EINVAL;
1363			goto out_err;
1364		}
1365	}
1366
1367	/* Initialize locations where the outputs are stored. */
1368	for (i = 0; i < so->num_outputs; i++) {
1369
1370		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
1371		start_comp[i] = so->output[i].start_component;
1372		/* Lower outputs with dst_offset < start_component.
1373		 *
1374		 * We can only output 4D vectors with a write mask, e.g. we can
1375		 * only output the W component at offset 3, etc. If we want
1376		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
1377		 * to move it to X and output X. */
1378		if (so->output[i].dst_offset < so->output[i].start_component) {
1379			unsigned tmp = r600_get_temp(ctx);
1380
1381			for (j = 0; j < so->output[i].num_components; j++) {
1382				struct r600_bytecode_alu alu;
1383				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1384				alu.op = ALU_OP1_MOV;
1385				alu.src[0].sel = so_gpr[i];
1386				alu.src[0].chan = so->output[i].start_component + j;
1387
1388				alu.dst.sel = tmp;
1389				alu.dst.chan = j;
1390				alu.dst.write = 1;
1391				if (j == so->output[i].num_components - 1)
1392					alu.last = 1;
1393				r = r600_bytecode_add_alu(ctx->bc, &alu);
1394				if (r)
1395					return r;
1396			}
1397			start_comp[i] = 0;
1398			so_gpr[i] = tmp;
1399		}
1400	}
1401
1402	/* Write outputs to buffers. */
1403	for (i = 0; i < so->num_outputs; i++) {
1404		struct r600_bytecode_output output;
1405
1406		if (stream != -1 && stream != so->output[i].output_buffer)
1407			continue;
1408
1409		memset(&output, 0, sizeof(struct r600_bytecode_output));
1410		output.gpr = so_gpr[i];
1411		output.elem_size = so->output[i].num_components - 1;
1412		if (output.elem_size == 2)
1413			output.elem_size = 3; // 3 not supported, write 4 with junk at end
1414		output.array_base = so->output[i].dst_offset - start_comp[i];
1415		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1416		output.burst_count = 1;
1417		/* array_size is an upper limit for the burst_count
1418		 * with MEM_STREAM instructions */
1419		output.array_size = 0xFFF;
1420		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
1421
1422		if (ctx->bc->chip_class >= EVERGREEN) {
1423			switch (so->output[i].output_buffer) {
1424			case 0:
1425				output.op = CF_OP_MEM_STREAM0_BUF0;
1426				break;
1427			case 1:
1428				output.op = CF_OP_MEM_STREAM0_BUF1;
1429				break;
1430			case 2:
1431				output.op = CF_OP_MEM_STREAM0_BUF2;
1432				break;
1433			case 3:
1434				output.op = CF_OP_MEM_STREAM0_BUF3;
1435				break;
1436			}
1437			output.op += so->output[i].stream * 4;
1438			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
1439			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
1440		} else {
1441			switch (so->output[i].output_buffer) {
1442			case 0:
1443				output.op = CF_OP_MEM_STREAM0;
1444				break;
1445			case 1:
1446				output.op = CF_OP_MEM_STREAM1;
1447				break;
1448			case 2:
1449				output.op = CF_OP_MEM_STREAM2;
1450				break;
1451			case 3:
1452				output.op = CF_OP_MEM_STREAM3;
1453					break;
1454			}
1455			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
1456		}
1457		r = r600_bytecode_add_output(ctx->bc, &output);
1458		if (r)
1459			goto out_err;
1460	}
1461	return 0;
1462out_err:
1463	return r;
1464}
1465
1466static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
1467{
1468	struct r600_bytecode_alu alu;
1469	unsigned reg;
1470
1471	if (!ctx->shader->vs_out_edgeflag)
1472		return;
1473
1474	reg = ctx->shader->output[ctx->edgeflag_output].gpr;
1475
1476	/* clamp(x, 0, 1) */
1477	memset(&alu, 0, sizeof(alu));
1478	alu.op = ALU_OP1_MOV;
1479	alu.src[0].sel = reg;
1480	alu.dst.sel = reg;
1481	alu.dst.write = 1;
1482	alu.dst.clamp = 1;
1483	alu.last = 1;
1484	r600_bytecode_add_alu(ctx->bc, &alu);
1485
1486	memset(&alu, 0, sizeof(alu));
1487	alu.op = ALU_OP1_FLT_TO_INT;
1488	alu.src[0].sel = reg;
1489	alu.dst.sel = reg;
1490	alu.dst.write = 1;
1491	alu.last = 1;
1492	r600_bytecode_add_alu(ctx->bc, &alu);
1493}
1494
/* Build the "GS copy shader": a small vertex shader that runs after a
 * geometry shader, reads one emitted vertex back from the GSVS ring
 * buffer and exports it (position, params, and any stream-output
 * writes).  The result is stored in gs->gs_copy_shader.
 *
 * Returns the r600_bytecode_build() result.  NOTE(review): allocation
 * failure also returns 0 (looks like success) leaving gs_copy_shader
 * NULL -- verify callers cope with that.
 */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	int ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int i, j, next_clip_pos = 61, next_param = 0;
	int ring;

	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	/* the copy shader exports exactly the GS outputs */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x = R0.x & 0x3fffffff */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30 -- R0.y now holds the stream id from the top
	 * two bits of R0.x */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		/* one GPR and one vec4 ring slot per output */
		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	/* temps start right after the per-output GPRs */
	ctx.temp_reg = i + 1;
	/* one predicated block per ring: PRED_SETE_INT on R0.y selects the
	 * stream; JUMP targets are patched once the next POP is emitted */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, ring, &cshader->shader.ring_item_sizes[ring]);
		/* one vec4 (16 bytes) per output */
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* only export outputs that belong to stream 0 */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* emit a dummy position export if none was written above */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	/* emit a dummy param export if none was written above */
	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* the final exports of each kind must be EXPORT_DONE */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* close the last predicated block */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
1789
/* Write the current vertex's outputs to a GS ring buffer (MEM_RING*).
 *
 * Used both by an ES (VS running as ES, gs_for_vs != NULL -- outputs are
 * matched to GS inputs by name/sid to find their ring offsets) and by a
 * GS writing the GSVS ring (offsets assigned sequentially).
 *
 * stream selects the MEM_RING instruction (-1 behaves like stream 0).
 * ind selects indexed export mode: the ring address then comes from
 * gs_export_gpr_tregs[stream], which is advanced by one vertex at the
 * end; otherwise the offset is folded into array_base directly.
 *
 * Returns 0 on success or a bytecode emission error code.
 */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int i, k, ring_offset;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* output not consumed by the GS: skip it */
			if (ring_offset == -1)
				continue;
		} else {
			/* GS path: sequential vec4 slots (16 bytes each) */
			ring_offset = idx * 16;
			idx++;
		}

		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	if (ind) {
		/* get a temp and add the ring offset to the next vertex base in the shader */
		struct r600_bytecode_alu alu;
		int r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->gs_export_gpr_tregs[effective_stream];
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
		alu.dst.sel = ctx->gs_export_gpr_tregs[effective_stream];
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	++ctx->gs_next_vertex;
	return 0;
}
1876
1877static int r600_shader_from_tgsi(struct r600_context *rctx,
1878				 struct r600_pipe_shader *pipeshader,
1879				 union r600_shader_key key)
1880{
1881	struct r600_screen *rscreen = rctx->screen;
1882	struct r600_shader *shader = &pipeshader->shader;
1883	struct tgsi_token *tokens = pipeshader->selector->tokens;
1884	struct pipe_stream_output_info so = pipeshader->selector->so;
1885	struct tgsi_full_immediate *immediate;
1886	struct r600_shader_ctx ctx;
1887	struct r600_bytecode_output output[32];
1888	unsigned output_done, noutput;
1889	unsigned opcode;
1890	int i, j, k, r = 0;
1891	int next_param_base = 0, next_clip_base;
1892	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
1893	/* Declarations used by llvm code */
1894	bool use_llvm = false;
1895	bool indirect_gprs;
1896	bool ring_outputs = false;
1897	bool pos_emitted = false;
1898
1899#ifdef R600_USE_LLVM
1900	use_llvm = rscreen->b.debug_flags & DBG_LLVM;
1901#endif
1902	ctx.bc = &shader->bc;
1903	ctx.shader = shader;
1904	ctx.native_integers = true;
1905
1906
1907	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
1908			   rscreen->has_compressed_msaa_texturing);
1909	ctx.tokens = tokens;
1910	tgsi_scan_shader(tokens, &ctx.info);
1911	shader->indirect_files = ctx.info.indirect_files;
1912
1913	shader->uses_doubles = ctx.info.uses_doubles;
1914
1915	indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT);
1916	tgsi_parse_init(&ctx.parse, tokens);
1917	ctx.type = ctx.info.processor;
1918	shader->processor_type = ctx.type;
1919	ctx.bc->type = shader->processor_type;
1920
1921	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1922		shader->vs_as_gs_a = key.vs.as_gs_a;
1923		shader->vs_as_es = key.vs.as_es;
1924	}
1925
1926	ring_outputs = shader->vs_as_es || ctx.type == TGSI_PROCESSOR_GEOMETRY;
1927
1928	if (shader->vs_as_es) {
1929		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
1930	} else {
1931		ctx.gs_for_vs = NULL;
1932	}
1933
1934	ctx.next_ring_offset = 0;
1935	ctx.gs_out_ring_offset = 0;
1936	ctx.gs_next_vertex = 0;
1937	ctx.gs_stream_output_info = &so;
1938
1939	shader->uses_index_registers = false;
1940	ctx.face_gpr = -1;
1941	ctx.fixed_pt_position_gpr = -1;
1942	ctx.fragcoord_input = -1;
1943	ctx.colors_used = 0;
1944	ctx.clip_vertex_write = 0;
1945
1946	shader->nr_ps_color_exports = 0;
1947	shader->nr_ps_max_color_exports = 0;
1948
1949	if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
1950		shader->two_side = key.ps.color_two_side;
1951
1952	/* register allocations */
1953	/* Values [0,127] correspond to GPR[0..127].
1954	 * Values [128,159] correspond to constant buffer bank 0
1955	 * Values [160,191] correspond to constant buffer bank 1
1956	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1957	 * Values [256,287] correspond to constant buffer bank 2 (EG)
1958	 * Values [288,319] correspond to constant buffer bank 3 (EG)
1959	 * Other special values are shown in the list below.
1960	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1961	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1962	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1963	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1964	 * 248	SQ_ALU_SRC_0: special constant 0.0.
1965	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
1966	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
1967	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1968	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
1969	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
1970	 * 254	SQ_ALU_SRC_PV: previous vector result.
1971	 * 255	SQ_ALU_SRC_PS: previous scalar result.
1972	 */
1973	for (i = 0; i < TGSI_FILE_COUNT; i++) {
1974		ctx.file_offset[i] = 0;
1975	}
1976
1977#ifdef R600_USE_LLVM
1978	if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
1979		fprintf(stderr, "Warning: R600 LLVM backend does not support "
1980				"indirect adressing.  Falling back to TGSI "
1981				"backend.\n");
1982		use_llvm = 0;
1983	}
1984#endif
1985	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1986		ctx.file_offset[TGSI_FILE_INPUT] = 1;
1987		if (!use_llvm) {
1988			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
1989		}
1990	}
1991	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
1992		if (ctx.bc->chip_class >= EVERGREEN)
1993			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1994		else
1995			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
1996	}
1997	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
1998		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
1999		ctx.file_offset[TGSI_FILE_INPUT] = 2;
2000	}
2001	ctx.use_llvm = use_llvm;
2002
2003	if (use_llvm) {
2004		ctx.file_offset[TGSI_FILE_OUTPUT] =
2005			ctx.file_offset[TGSI_FILE_INPUT];
2006	} else {
2007	   ctx.file_offset[TGSI_FILE_OUTPUT] =
2008			ctx.file_offset[TGSI_FILE_INPUT] +
2009			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
2010	}
2011	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
2012						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
2013
2014	/* Outside the GPR range. This will be translated to one of the
2015	 * kcache banks later. */
2016	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
2017
2018	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
2019	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
2020			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
2021	ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
2022	ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
2023
2024	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2025		ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
2026		ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
2027		ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
2028		ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
2029		ctx.temp_reg = ctx.bc->ar_reg + 7;
2030	} else {
2031		ctx.temp_reg = ctx.bc->ar_reg + 3;
2032	}
2033
2034	shader->max_arrays = 0;
2035	shader->num_arrays = 0;
2036	if (indirect_gprs) {
2037
2038		if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
2039			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
2040			                   ctx.file_offset[TGSI_FILE_OUTPUT] -
2041			                   ctx.file_offset[TGSI_FILE_INPUT],
2042			                   0x0F);
2043		}
2044		if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
2045			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
2046			                   ctx.file_offset[TGSI_FILE_TEMPORARY] -
2047			                   ctx.file_offset[TGSI_FILE_OUTPUT],
2048			                   0x0F);
2049		}
2050	}
2051
2052	ctx.nliterals = 0;
2053	ctx.literals = NULL;
2054
2055	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
2056	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
2057
2058	if (shader->vs_as_gs_a)
2059		vs_add_primid_output(&ctx, key.vs.prim_id_out);
2060
2061	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
2062		tgsi_parse_token(&ctx.parse);
2063		switch (ctx.parse.FullToken.Token.Type) {
2064		case TGSI_TOKEN_TYPE_IMMEDIATE:
2065			immediate = &ctx.parse.FullToken.FullImmediate;
2066			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
2067			if(ctx.literals == NULL) {
2068				r = -ENOMEM;
2069				goto out_err;
2070			}
2071			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
2072			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
2073			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
2074			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
2075			ctx.nliterals++;
2076			break;
2077		case TGSI_TOKEN_TYPE_DECLARATION:
2078			r = tgsi_declaration(&ctx);
2079			if (r)
2080				goto out_err;
2081			break;
2082		case TGSI_TOKEN_TYPE_INSTRUCTION:
2083		case TGSI_TOKEN_TYPE_PROPERTY:
2084			break;
2085		default:
2086			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
2087			r = -EINVAL;
2088			goto out_err;
2089		}
2090	}
2091
2092	shader->ring_item_sizes[0] = ctx.next_ring_offset;
2093	shader->ring_item_sizes[1] = 0;
2094	shader->ring_item_sizes[2] = 0;
2095	shader->ring_item_sizes[3] = 0;
2096
2097	/* Process two side if needed */
2098	if (shader->two_side && ctx.colors_used) {
2099		int i, count = ctx.shader->ninput;
2100		unsigned next_lds_loc = ctx.shader->nlds;
2101
2102		/* additional inputs will be allocated right after the existing inputs,
2103		 * we won't need them after the color selection, so we don't need to
2104		 * reserve these gprs for the rest of the shader code and to adjust
2105		 * output offsets etc. */
2106		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
2107				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
2108
2109		/* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
2110		if (ctx.face_gpr == -1) {
2111			i = ctx.shader->ninput++;
2112			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
2113			ctx.shader->input[i].spi_sid = 0;
2114			ctx.shader->input[i].gpr = gpr++;
2115			ctx.face_gpr = ctx.shader->input[i].gpr;
2116		}
2117
2118		for (i = 0; i < count; i++) {
2119			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2120				int ni = ctx.shader->ninput++;
2121				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
2122				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
2123				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
2124				ctx.shader->input[ni].gpr = gpr++;
2125				// TGSI to LLVM needs to know the lds position of inputs.
2126				// Non LLVM path computes it later (in process_twoside_color)
2127				ctx.shader->input[ni].lds_pos = next_lds_loc++;
2128				ctx.shader->input[i].back_color_input = ni;
2129				if (ctx.bc->chip_class >= EVERGREEN) {
2130					if ((r = evergreen_interp_input(&ctx, ni)))
2131						return r;
2132				}
2133			}
2134		}
2135	}
2136
2137/* LLVM backend setup */
2138#ifdef R600_USE_LLVM
2139	if (use_llvm) {
2140		struct radeon_llvm_context radeon_llvm_ctx;
2141		LLVMModuleRef mod;
2142		bool dump = r600_can_dump_shader(&rscreen->b, tokens);
2143		boolean use_kill = false;
2144
2145		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
2146		radeon_llvm_ctx.type = ctx.type;
2147		radeon_llvm_ctx.two_side = shader->two_side;
2148		radeon_llvm_ctx.face_gpr = ctx.face_gpr;
2149		radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
2150		radeon_llvm_ctx.r600_inputs = ctx.shader->input;
2151		radeon_llvm_ctx.r600_outputs = ctx.shader->output;
2152		radeon_llvm_ctx.color_buffer_count = max_color_exports;
2153		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
2154		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
2155		radeon_llvm_ctx.stream_outputs = &so;
2156		radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one;
2157		radeon_llvm_ctx.has_compressed_msaa_texturing =
2158			ctx.bc->has_compressed_msaa_texturing;
2159		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
2160		ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
2161		ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
2162
2163		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
2164			radeon_llvm_dispose(&radeon_llvm_ctx);
2165			use_llvm = 0;
2166			fprintf(stderr, "R600 LLVM backend failed to compile "
2167				"shader.  Falling back to TGSI\n");
2168		} else {
2169			ctx.file_offset[TGSI_FILE_OUTPUT] =
2170					ctx.file_offset[TGSI_FILE_INPUT];
2171		}
2172		if (use_kill)
2173			ctx.shader->uses_kill = use_kill;
2174		radeon_llvm_dispose(&radeon_llvm_ctx);
2175	}
2176#endif
2177/* End of LLVM backend setup */
2178
2179	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
2180		shader->nr_ps_max_color_exports = 8;
2181
2182	if (!use_llvm) {
2183		if (ctx.fragcoord_input >= 0) {
2184			if (ctx.bc->chip_class == CAYMAN) {
2185				for (j = 0 ; j < 4; j++) {
2186					struct r600_bytecode_alu alu;
2187					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2188					alu.op = ALU_OP1_RECIP_IEEE;
2189					alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2190					alu.src[0].chan = 3;
2191
2192					alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2193					alu.dst.chan = j;
2194					alu.dst.write = (j == 3);
2195					alu.last = 1;
2196					if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2197						return r;
2198				}
2199			} else {
2200				struct r600_bytecode_alu alu;
2201				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2202				alu.op = ALU_OP1_RECIP_IEEE;
2203				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2204				alu.src[0].chan = 3;
2205
2206				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2207				alu.dst.chan = 3;
2208				alu.dst.write = 1;
2209				alu.last = 1;
2210				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2211					return r;
2212			}
2213		}
2214
2215		if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2216			struct r600_bytecode_alu alu;
2217			int r;
2218			for (j = 0; j < 4; j++) {
2219				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2220				alu.op = ALU_OP1_MOV;
2221				alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
2222				alu.src[0].value = 0;
2223				alu.dst.sel = ctx.gs_export_gpr_tregs[j];
2224				alu.dst.write = 1;
2225				alu.last = 1;
2226				r = r600_bytecode_add_alu(ctx.bc, &alu);
2227				if (r)
2228					return r;
2229			}
2230		}
2231		if (shader->two_side && ctx.colors_used) {
2232			if ((r = process_twoside_color_inputs(&ctx)))
2233				return r;
2234		}
2235
2236		tgsi_parse_init(&ctx.parse, tokens);
2237		while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
2238			tgsi_parse_token(&ctx.parse);
2239			switch (ctx.parse.FullToken.Token.Type) {
2240			case TGSI_TOKEN_TYPE_INSTRUCTION:
2241				r = tgsi_is_supported(&ctx);
2242				if (r)
2243					goto out_err;
2244				ctx.max_driver_temp_used = 0;
2245				/* reserve first tmp for everyone */
2246				r600_get_temp(&ctx);
2247
2248				opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
2249				if ((r = tgsi_split_constant(&ctx)))
2250					goto out_err;
2251				if ((r = tgsi_split_literal_constant(&ctx)))
2252					goto out_err;
2253				if (ctx.type == TGSI_PROCESSOR_GEOMETRY)
2254					if ((r = tgsi_split_gs_inputs(&ctx)))
2255						goto out_err;
2256				if (ctx.bc->chip_class == CAYMAN)
2257					ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
2258				else if (ctx.bc->chip_class >= EVERGREEN)
2259					ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
2260				else
2261					ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
2262				r = ctx.inst_info->process(&ctx);
2263				if (r)
2264					goto out_err;
2265				break;
2266			default:
2267				break;
2268			}
2269		}
2270	}
2271
2272	/* Reset the temporary register counter. */
2273	ctx.max_driver_temp_used = 0;
2274
2275	noutput = shader->noutput;
2276
2277	if (!ring_outputs && ctx.clip_vertex_write) {
2278		unsigned clipdist_temp[2];
2279
2280		clipdist_temp[0] = r600_get_temp(&ctx);
2281		clipdist_temp[1] = r600_get_temp(&ctx);
2282
2283		/* need to convert a clipvertex write into clipdistance writes and not export
2284		   the clip vertex anymore */
2285
2286		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
2287		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2288		shader->output[noutput].gpr = clipdist_temp[0];
2289		noutput++;
2290		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2291		shader->output[noutput].gpr = clipdist_temp[1];
2292		noutput++;
2293
2294		/* reset spi_sid for clipvertex output to avoid confusing spi */
2295		shader->output[ctx.cv_output].spi_sid = 0;
2296
2297		shader->clip_dist_write = 0xFF;
2298
2299		for (i = 0; i < 8; i++) {
2300			int oreg = i >> 2;
2301			int ochan = i & 3;
2302
2303			for (j = 0; j < 4; j++) {
2304				struct r600_bytecode_alu alu;
2305				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2306				alu.op = ALU_OP2_DOT4;
2307				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
2308				alu.src[0].chan = j;
2309
2310				alu.src[1].sel = 512 + i;
2311				alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
2312				alu.src[1].chan = j;
2313
2314				alu.dst.sel = clipdist_temp[oreg];
2315				alu.dst.chan = j;
2316				alu.dst.write = (j == ochan);
2317				if (j == 3)
2318					alu.last = 1;
2319				if (!use_llvm)
2320					r = r600_bytecode_add_alu(ctx.bc, &alu);
2321				if (r)
2322					return r;
2323			}
2324		}
2325	}
2326
2327	/* Add stream outputs. */
2328	if (!ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX &&
2329	    so.num_outputs && !use_llvm)
2330		emit_streamout(&ctx, &so, -1, NULL);
2331
2332	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2333	convert_edgeflag_to_int(&ctx);
2334
2335	if (ring_outputs) {
2336		if (shader->vs_as_es) {
2337			ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
2338			ctx.gs_export_gpr_tregs[1] = -1;
2339			ctx.gs_export_gpr_tregs[2] = -1;
2340			ctx.gs_export_gpr_tregs[3] = -1;
2341
2342			emit_gs_ring_writes(&ctx, &so, -1, FALSE);
2343		}
2344	} else {
2345		/* Export output */
2346		next_clip_base = shader->vs_out_misc_write ? 62 : 61;
2347
2348		for (i = 0, j = 0; i < noutput; i++, j++) {
2349			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2350			output[j].gpr = shader->output[i].gpr;
2351			output[j].elem_size = 3;
2352			output[j].swizzle_x = 0;
2353			output[j].swizzle_y = 1;
2354			output[j].swizzle_z = 2;
2355			output[j].swizzle_w = 3;
2356			output[j].burst_count = 1;
2357			output[j].type = -1;
2358			output[j].op = CF_OP_EXPORT;
2359			switch (ctx.type) {
2360			case TGSI_PROCESSOR_VERTEX:
2361				switch (shader->output[i].name) {
2362				case TGSI_SEMANTIC_POSITION:
2363					output[j].array_base = 60;
2364					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2365					pos_emitted = true;
2366					break;
2367
2368				case TGSI_SEMANTIC_PSIZE:
2369					output[j].array_base = 61;
2370					output[j].swizzle_y = 7;
2371					output[j].swizzle_z = 7;
2372					output[j].swizzle_w = 7;
2373					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2374					pos_emitted = true;
2375					break;
2376				case TGSI_SEMANTIC_EDGEFLAG:
2377					output[j].array_base = 61;
2378					output[j].swizzle_x = 7;
2379					output[j].swizzle_y = 0;
2380					output[j].swizzle_z = 7;
2381					output[j].swizzle_w = 7;
2382					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2383					pos_emitted = true;
2384					break;
2385				case TGSI_SEMANTIC_LAYER:
2386					/* spi_sid is 0 for outputs that are
2387					 * not consumed by PS */
2388					if (shader->output[i].spi_sid) {
2389						output[j].array_base = next_param_base++;
2390						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2391						j++;
2392						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2393					}
2394					output[j].array_base = 61;
2395					output[j].swizzle_x = 7;
2396					output[j].swizzle_y = 7;
2397					output[j].swizzle_z = 0;
2398					output[j].swizzle_w = 7;
2399					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2400					pos_emitted = true;
2401					break;
2402				case TGSI_SEMANTIC_VIEWPORT_INDEX:
2403					/* spi_sid is 0 for outputs that are
2404					 * not consumed by PS */
2405					if (shader->output[i].spi_sid) {
2406						output[j].array_base = next_param_base++;
2407						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2408						j++;
2409						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2410					}
2411					output[j].array_base = 61;
2412					output[j].swizzle_x = 7;
2413					output[j].swizzle_y = 7;
2414					output[j].swizzle_z = 7;
2415					output[j].swizzle_w = 0;
2416					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2417					pos_emitted = true;
2418					break;
2419				case TGSI_SEMANTIC_CLIPVERTEX:
2420					j--;
2421					break;
2422				case TGSI_SEMANTIC_CLIPDIST:
2423					output[j].array_base = next_clip_base++;
2424					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2425					pos_emitted = true;
2426					/* spi_sid is 0 for clipdistance outputs that were generated
2427					 * for clipvertex - we don't need to pass them to PS */
2428					if (shader->output[i].spi_sid) {
2429						j++;
2430						/* duplicate it as PARAM to pass to the pixel shader */
2431						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2432						output[j].array_base = next_param_base++;
2433						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2434					}
2435					break;
2436				case TGSI_SEMANTIC_FOG:
2437					output[j].swizzle_y = 4; /* 0 */
2438					output[j].swizzle_z = 4; /* 0 */
2439					output[j].swizzle_w = 5; /* 1 */
2440					break;
2441				case TGSI_SEMANTIC_PRIMID:
2442					output[j].swizzle_x = 2;
2443					output[j].swizzle_y = 4; /* 0 */
2444					output[j].swizzle_z = 4; /* 0 */
2445					output[j].swizzle_w = 4; /* 0 */
2446					break;
2447				}
2448
2449				break;
2450			case TGSI_PROCESSOR_FRAGMENT:
2451				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
2452					/* never export more colors than the number of CBs */
2453					if (shader->output[i].sid >= max_color_exports) {
2454						/* skip export */
2455						j--;
2456						continue;
2457					}
2458					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
2459					output[j].array_base = shader->output[i].sid;
2460					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2461					shader->nr_ps_color_exports++;
2462					if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
2463						for (k = 1; k < max_color_exports; k++) {
2464							j++;
2465							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2466							output[j].gpr = shader->output[i].gpr;
2467							output[j].elem_size = 3;
2468							output[j].swizzle_x = 0;
2469							output[j].swizzle_y = 1;
2470							output[j].swizzle_z = 2;
2471							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
2472							output[j].burst_count = 1;
2473							output[j].array_base = k;
2474							output[j].op = CF_OP_EXPORT;
2475							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2476							shader->nr_ps_color_exports++;
2477						}
2478					}
2479				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
2480					output[j].array_base = 61;
2481					output[j].swizzle_x = 2;
2482					output[j].swizzle_y = 7;
2483					output[j].swizzle_z = output[j].swizzle_w = 7;
2484					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2485				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
2486					output[j].array_base = 61;
2487					output[j].swizzle_x = 7;
2488					output[j].swizzle_y = 1;
2489					output[j].swizzle_z = output[j].swizzle_w = 7;
2490					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2491				} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
2492					output[j].array_base = 61;
2493					output[j].swizzle_x = 7;
2494					output[j].swizzle_y = 7;
2495					output[j].swizzle_z = 0;
2496					output[j].swizzle_w = 7;
2497					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2498				} else {
2499					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
2500					r = -EINVAL;
2501					goto out_err;
2502				}
2503				break;
2504			default:
2505				R600_ERR("unsupported processor type %d\n", ctx.type);
2506				r = -EINVAL;
2507				goto out_err;
2508			}
2509
2510			if (output[j].type==-1) {
2511				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2512				output[j].array_base = next_param_base++;
2513			}
2514		}
2515
2516		/* add fake position export */
2517		if (ctx.type == TGSI_PROCESSOR_VERTEX && pos_emitted == false) {
2518			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2519			output[j].gpr = 0;
2520			output[j].elem_size = 3;
2521			output[j].swizzle_x = 7;
2522			output[j].swizzle_y = 7;
2523			output[j].swizzle_z = 7;
2524			output[j].swizzle_w = 7;
2525			output[j].burst_count = 1;
2526			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2527			output[j].array_base = 60;
2528			output[j].op = CF_OP_EXPORT;
2529			j++;
2530		}
2531
2532		/* add fake param output for vertex shader if no param is exported */
2533		if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
2534			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2535			output[j].gpr = 0;
2536			output[j].elem_size = 3;
2537			output[j].swizzle_x = 7;
2538			output[j].swizzle_y = 7;
2539			output[j].swizzle_z = 7;
2540			output[j].swizzle_w = 7;
2541			output[j].burst_count = 1;
2542			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2543			output[j].array_base = 0;
2544			output[j].op = CF_OP_EXPORT;
2545			j++;
2546		}
2547
2548		/* add fake pixel export */
2549		if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
2550			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2551			output[j].gpr = 0;
2552			output[j].elem_size = 3;
2553			output[j].swizzle_x = 7;
2554			output[j].swizzle_y = 7;
2555			output[j].swizzle_z = 7;
2556			output[j].swizzle_w = 7;
2557			output[j].burst_count = 1;
2558			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2559			output[j].array_base = 0;
2560			output[j].op = CF_OP_EXPORT;
2561			j++;
2562			shader->nr_ps_color_exports++;
2563		}
2564
2565		noutput = j;
2566
2567		/* set export done on last export of each type */
2568		for (i = noutput - 1, output_done = 0; i >= 0; i--) {
2569			if (!(output_done & (1 << output[i].type))) {
2570				output_done |= (1 << output[i].type);
2571				output[i].op = CF_OP_EXPORT_DONE;
2572			}
2573		}
2574		/* add output to bytecode */
2575		if (!use_llvm) {
2576			for (i = 0; i < noutput; i++) {
2577				r = r600_bytecode_add_output(ctx.bc, &output[i]);
2578				if (r)
2579					goto out_err;
2580			}
2581		}
2582	}
2583
2584	/* add program end */
2585	if (!use_llvm) {
2586		if (ctx.bc->chip_class == CAYMAN)
2587			cm_bytecode_add_cf_end(ctx.bc);
2588		else {
2589			const struct cf_op_info *last = NULL;
2590
2591			if (ctx.bc->cf_last)
2592				last = r600_isa_cf(ctx.bc->cf_last->op);
2593
2594			/* alu clause instructions don't have EOP bit, so add NOP */
2595			if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS)
2596				r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2597
2598			ctx.bc->cf_last->end_of_program = 1;
2599		}
2600	}
2601
2602	/* check GPR limit - we have 124 = 128 - 4
2603	 * (4 are reserved as alu clause temporary registers) */
2604	if (ctx.bc->ngpr > 124) {
2605		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
2606		r = -ENOMEM;
2607		goto out_err;
2608	}
2609
2610	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2611		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
2612			return r;
2613	}
2614
2615	free(ctx.literals);
2616	tgsi_parse_free(&ctx.parse);
2617	return 0;
2618out_err:
2619	free(ctx.literals);
2620	tgsi_parse_free(&ctx.parse);
2621	return r;
2622}
2623
2624static int tgsi_unsupported(struct r600_shader_ctx *ctx)
2625{
2626	const unsigned tgsi_opcode =
2627		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
2628	R600_ERR("%s tgsi opcode unsupported\n",
2629		 tgsi_get_opcode_name(tgsi_opcode));
2630	return -EINVAL;
2631}
2632
/* Handler for TGSI_OPCODE_END: nothing needs to be emitted here
 * (program termination is handled separately by the caller). */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	return 0;
}
2637
2638static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
2639			const struct r600_shader_src *shader_src,
2640			unsigned chan)
2641{
2642	bc_src->sel = shader_src->sel;
2643	bc_src->chan = shader_src->swizzle[chan];
2644	bc_src->neg = shader_src->neg;
2645	bc_src->abs = shader_src->abs;
2646	bc_src->rel = shader_src->rel;
2647	bc_src->value = shader_src->value[bc_src->chan];
2648	bc_src->kc_bank = shader_src->kc_bank;
2649	bc_src->kc_rel = shader_src->kc_rel;
2650}
2651
2652static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
2653{
2654	bc_src->abs = 1;
2655	bc_src->neg = 0;
2656}
2657
2658static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
2659{
2660	bc_src->neg = !bc_src->neg;
2661}
2662
2663static void tgsi_dst(struct r600_shader_ctx *ctx,
2664		     const struct tgsi_full_dst_register *tgsi_dst,
2665		     unsigned swizzle,
2666		     struct r600_bytecode_alu_dst *r600_dst)
2667{
2668	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2669
2670	r600_dst->sel = tgsi_dst->Register.Index;
2671	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
2672	r600_dst->chan = swizzle;
2673	r600_dst->write = 1;
2674	if (tgsi_dst->Register.Indirect)
2675		r600_dst->rel = V_SQ_REL_RELATIVE;
2676	if (inst->Instruction.Saturate) {
2677		r600_dst->clamp = 1;
2678	}
2679}
2680
/* Return the index of the highest enabled channel in a 4-bit
 * writemask, or 0 when no channel is enabled. */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	for (chan = 3; chan >= 0; chan--) {
		if (writemask & (1u << chan))
			return chan;
	}
	return 0;
}
2692
2693
2694
/* Emit a two-source 64-bit (double) ALU operation.
 *
 * Doubles occupy channel pairs: xy holds one double, zw another.
 *
 * singledest: the op produces one (non-paired) result per source pair;
 *   the writemask is widened here to the containing pair, and when the
 *   target is the high channel of a pair (y or w) the result is staged
 *   through ctx->temp_reg and moved out afterwards (use_tmp holds
 *   source-channel-of-the-move + 1).
 * swap: exchange the two source operands.
 */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;

	if (singledest) {
		switch (write_mask) {
		case 0x1:
			/* result to x: compute the xy pair directly */
			write_mask = 0x3;
			break;
		case 0x2:
			/* result to y: stage via temp, value ends up in chan 0 */
			use_tmp = 1;
			write_mask = 0x3;
			break;
		case 0x4:
			/* result to z: compute the zw pair directly */
			write_mask = 0xc;
			break;
		case 0x8:
			/* result to w: stage via temp, value ends up in chan 2 */
			write_mask = 0xc;
			use_tmp = 3;
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			if (use_tmp) {
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
			}
			/* single result: only the low channel of each pair is written */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* sources are fetched through fp64_switch() — presumably
				 * swapping the 32-bit halves of the pair; confirm against
				 * fp64_switch's definition */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			/* neg/abs modifiers only on the high word of each pair,
			 * which carries the double's sign bit */
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_SUB:
				r600_bytecode_src_toggle_neg(&alu.src[1]);
				break;
			case TGSI_OPCODE_DABS:
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* restore the caller's original (unwidened) writemask */
		write_mask = inst->Dst[0].Register.WriteMask;

		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			/* use_tmp - 1 is the temp channel holding the staged value */
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2797
2798static int tgsi_op2_64(struct r600_shader_ctx *ctx)
2799{
2800	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2801	unsigned write_mask = inst->Dst[0].Register.WriteMask;
2802	/* confirm writemasking */
2803	if ((write_mask & 0x3) != 0x3 &&
2804	    (write_mask & 0xc) != 0xc) {
2805		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
2806		return -1;
2807	}
2808	return tgsi_op2_64_params(ctx, false, false);
2809}
2810
/* 64-bit two-source op producing a single result per pair,
 * sources in natural order. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false);
}
2815
/* Same as tgsi_op2_64_single_dest, but with the two sources swapped. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true);
}
2820
2821static int tgsi_op3_64(struct r600_shader_ctx *ctx)
2822{
2823	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2824	struct r600_bytecode_alu alu;
2825	int i, j, r;
2826	int lasti = 3;
2827	int tmp = r600_get_temp(ctx);
2828
2829	for (i = 0; i < lasti + 1; i++) {
2830
2831		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2832		alu.op = ctx->inst_info->op;
2833		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2834			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
2835		}
2836
2837		if (inst->Dst[0].Register.WriteMask & (1 << i))
2838			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2839		else
2840			alu.dst.sel = tmp;
2841
2842		alu.dst.chan = i;
2843		alu.is_op3 = 1;
2844		if (i == lasti) {
2845			alu.last = 1;
2846		}
2847		r = r600_bytecode_add_alu(ctx->bc, &alu);
2848		if (r)
2849			return r;
2850	}
2851	return 0;
2852}
2853
/* Generic 2-source op translation, one ALU instruction per written channel.
 *
 * swap:       exchange src0/src1 (for opcodes whose hardware operand order
 *             is reversed relative to TGSI).
 * trans_only: the op only runs in the scalar (trans) unit, so every channel
 *             closes its own instruction group (alu.last on each); when more
 *             than one channel is written the results are staged in a temp
 *             and copied to the real destination afterwards. */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* handle some special cases */
		switch (inst->Instruction.Opcode) {
		case TGSI_OPCODE_SUB:
			/* SUB is implemented as ADD with src1 negated */
			r600_bytecode_src_toggle_neg(&alu.src[1]);
			break;
		case TGSI_OPCODE_ABS:
			/* ABS is implemented as MOV with the abs modifier */
			r600_bytecode_src_set_abs(&alu.src[0]);
			break;
		default:
			break;
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2923
/* Plain 2-source op: no operand swap, vector-unit capable. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
2928
/* 2-source op with src0/src1 exchanged before emission. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
2933
/* 2-source op restricted to the scalar (trans) unit. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
2938
2939static int tgsi_ineg(struct r600_shader_ctx *ctx)
2940{
2941	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2942	struct r600_bytecode_alu alu;
2943	int i, r;
2944	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2945
2946	for (i = 0; i < lasti + 1; i++) {
2947
2948		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2949			continue;
2950		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2951		alu.op = ctx->inst_info->op;
2952
2953		alu.src[0].sel = V_SQ_ALU_SRC_0;
2954
2955		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2956
2957		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2958
2959		if (i == lasti) {
2960			alu.last = 1;
2961		}
2962		r = r600_bytecode_add_alu(ctx->bc, &alu);
2963		if (r)
2964			return r;
2965	}
2966	return 0;
2967
2968}
2969
2970static int tgsi_dneg(struct r600_shader_ctx *ctx)
2971{
2972	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2973	struct r600_bytecode_alu alu;
2974	int i, r;
2975	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2976
2977	for (i = 0; i < lasti + 1; i++) {
2978
2979		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2980			continue;
2981		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2982		alu.op = ALU_OP1_MOV;
2983
2984		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2985
2986		if (i == 1 || i == 3)
2987			r600_bytecode_src_toggle_neg(&alu.src[0]);
2988		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2989
2990		if (i == lasti) {
2991			alu.last = 1;
2992		}
2993		r = r600_bytecode_add_alu(ctx->bc, &alu);
2994		if (r)
2995			return r;
2996	}
2997	return 0;
2998
2999}
3000
/* DFRACEXP: run the op over all four slots into a temp, then move the
 * fraction (temp.zw) into dst0 and the exponent into dst1. */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;
	/* dst0 may target either the xy or the zw pair */
	int firsti = write_mask == 0xc ? 2 : 0;

	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* MOV first two channels to writemask dst0 */
	for (i = 0; i <= 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].chan = i + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			/* NOTE(review): the comment says "third channel" but the
			 * source channel is 1 (second) — confirm where the op
			 * actually deposits the exponent in the temp. */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			/* only the first written channel of dst1 is emitted */
			break;
		}
	}
	return 0;
}
3061
/* Emit a 1-source double op.  The op is replicated over three slots; only
 * slots 0 and 1 commit their result (the X/Y pair of the temp), after which
 * the pair is fanned out to the destination channels per the writemask. */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = 3;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* these have to write the result to X/Y by the looks of it */
	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		/* should only be one src regs */
		assert (inst->Instruction.NumSrcRegs == 1);

		/* src0 = high dword, src1 = low dword of the double operand */
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);

		/* RSQ should take the absolute value of src */
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		    ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT) {
			r600_bytecode_src_set_abs(&alu.src[1]);
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = (i == 0 || i == 1);

		if (ctx->bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* copy t1.xy to every written channel pair of the destination */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		/* even channels read t1.x, odd channels t1.y */
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3115
3116static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
3117{
3118	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3119	int i, j, r;
3120	struct r600_bytecode_alu alu;
3121	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3122
3123	for (i = 0 ; i < last_slot; i++) {
3124		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3125		alu.op = ctx->inst_info->op;
3126		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3127			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
3128
3129			/* RSQ should take the absolute value of src */
3130			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
3131				r600_bytecode_src_set_abs(&alu.src[j]);
3132			}
3133		}
3134		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3135		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3136
3137		if (i == last_slot - 1)
3138			alu.last = 1;
3139		r = r600_bytecode_add_alu(ctx->bc, &alu);
3140		if (r)
3141			return r;
3142	}
3143	return 0;
3144}
3145
/* CAYMAN integer multiply: for each written channel k, replicate the op
 * over all four slots (only slot k commits, into temp t1), then copy the
 * staged results to the destination. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			/* only the slot matching the channel commits its result */
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* copy the staged results from t1 to the real destination */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3193
3194
3195static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
3196{
3197	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3198	int i, j, k, r;
3199	struct r600_bytecode_alu alu;
3200	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3201	int t1 = ctx->temp_reg;
3202
3203	for (k = 0; k < 2; k++) {
3204		if (!(inst->Dst[0].Register.WriteMask & (0x3 << (k * 2))))
3205			continue;
3206
3207		for (i = 0; i < 4; i++) {
3208			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3209			alu.op = ctx->inst_info->op;
3210			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3211				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));;
3212			}
3213			alu.dst.sel = t1;
3214			alu.dst.chan = i;
3215			alu.dst.write = 1;
3216			if (i == 3)
3217				alu.last = 1;
3218			r = r600_bytecode_add_alu(ctx->bc, &alu);
3219			if (r)
3220				return r;
3221		}
3222	}
3223
3224	for (i = 0; i <= lasti; i++) {
3225		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3226			continue;
3227		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3228		alu.op = ALU_OP1_MOV;
3229		alu.src[0].sel = t1;
3230		alu.src[0].chan = i;
3231		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3232		alu.dst.write = 1;
3233		if (i == lasti)
3234			alu.last = 1;
3235		r = r600_bytecode_add_alu(ctx->bc, &alu);
3236		if (r)
3237			return r;
3238	}
3239
3240	return 0;
3241}
3242
3243/*
3244 * r600 - trunc to -PI..PI range
3245 * r700 - normalize by dividing by 2PI
3246 * see fdo bug 27901
3247 */
3248static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
3249{
3250	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
3251	static float double_pi = 3.1415926535 * 2;
3252	static float neg_pi = -3.1415926535;
3253
3254	int r;
3255	struct r600_bytecode_alu alu;
3256
3257	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3258	alu.op = ALU_OP3_MULADD;
3259	alu.is_op3 = 1;
3260
3261	alu.dst.chan = 0;
3262	alu.dst.sel = ctx->temp_reg;
3263	alu.dst.write = 1;
3264
3265	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3266
3267	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3268	alu.src[1].chan = 0;
3269	alu.src[1].value = *(uint32_t *)&half_inv_pi;
3270	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
3271	alu.src[2].chan = 0;
3272	alu.last = 1;
3273	r = r600_bytecode_add_alu(ctx->bc, &alu);
3274	if (r)
3275		return r;
3276
3277	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3278	alu.op = ALU_OP1_FRACT;
3279
3280	alu.dst.chan = 0;
3281	alu.dst.sel = ctx->temp_reg;
3282	alu.dst.write = 1;
3283
3284	alu.src[0].sel = ctx->temp_reg;
3285	alu.src[0].chan = 0;
3286	alu.last = 1;
3287	r = r600_bytecode_add_alu(ctx->bc, &alu);
3288	if (r)
3289		return r;
3290
3291	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3292	alu.op = ALU_OP3_MULADD;
3293	alu.is_op3 = 1;
3294
3295	alu.dst.chan = 0;
3296	alu.dst.sel = ctx->temp_reg;
3297	alu.dst.write = 1;
3298
3299	alu.src[0].sel = ctx->temp_reg;
3300	alu.src[0].chan = 0;
3301
3302	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3303	alu.src[1].chan = 0;
3304	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
3305	alu.src[2].chan = 0;
3306
3307	if (ctx->bc->chip_class == R600) {
3308		alu.src[1].value = *(uint32_t *)&double_pi;
3309		alu.src[2].value = *(uint32_t *)&neg_pi;
3310	} else {
3311		alu.src[1].sel = V_SQ_ALU_SRC_1;
3312		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
3313		alu.src[2].neg = 1;
3314	}
3315
3316	alu.last = 1;
3317	r = r600_bytecode_add_alu(ctx->bc, &alu);
3318	if (r)
3319		return r;
3320	return 0;
3321}
3322
/* CAYMAN SIN/COS: range-reduce the argument, then replicate the trig op
 * across the vector slots (per the CAYMAN notes at the top of this file),
 * committing only the channels selected by the writemask. */
static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	int i, r;

	/* reduce the argument into temp_reg.x */
	r = tgsi_setup_trig(ctx);
	if (r)
		return r;


	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		/* only commit writemask-selected channels */
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3353
/* Pre-CAYMAN SIN/COS: range-reduce, run the trig op once into temp_reg.x,
 * then broadcast the scalar result to every written destination channel. */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* reduce the argument into temp_reg.x */
	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	/* temp.x = op(temp.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3396
/* SCS: dst = (cos(src.x), sin(src.x), 0.0, 1.0), each component emitted
 * only when its writemask bit is set.  On CAYMAN the COS/SIN ops are
 * replicated over three slots with only the target slot committing. */
static int tgsi_scs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* We'll only need the trig stuff if we are going to write to the
	 * X or Y components of the destination vector.
	 */
	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
		r = tgsi_setup_trig(ctx);
		if (r)
			return r;
	}

	/* dst.x = COS */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_COS;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				/* only slot 0 (the X channel) commits */
				if (i == 0)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_COS;
			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.y = SIN */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_SIN;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				/* only slot 1 (the Y channel) commits */
				if (i == 1)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_SIN;
			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.z = 0.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.w = 1.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3517
/* KILL / KILL_IF: emit the kill opcode on all four channels.  Unconditional
 * KILL compares 0 against -1 so the kill always fires (assuming a GT-style
 * kill opcode from inst_info — confirm); KILL_IF compares against src. */
static int tgsi_kill(struct r600_shader_ctx *ctx)
{
	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
			/* unconditional: src1 = -1.0 */
			alu.src[1].sel = V_SQ_ALU_SRC_1;
			alu.src[1].neg = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* kill must be last in ALU */
	ctx->bc->force_add_cf = 1;
	ctx->shader->uses_kill = TRUE;
	return 0;
}
3551
/* LIT: dst = (1.0, max(src.x, 0), src.x > 0 ? pow(src.y, src.w) : 0, 1.0).
 * The z term is built as exp2(MUL_LIT(log2(max(src.y, 0)), src.w, src.x));
 * on CAYMAN the scalar LOG/EXP ops are replicated over three slots. */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* the expensive path is only needed when dst.z is written */
	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		int i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log result landed (branch-dependent) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel  = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
3701
/* RSQ: temp.x = 1/sqrt(|src.x|), then broadcast to all written channels.
 * Uses the CLAMPED variant (GL semantics for infinities). */
static int tgsi_rsq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIPSQRT_IEEE instead.
	 */
	alu.op = ALU_OP1_RECIPSQRT_CLAMPED;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
		/* RSQ operates on the absolute value of the source */
		r600_bytecode_src_set_abs(&alu.src[i]);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}
3729
3730static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
3731{
3732	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3733	struct r600_bytecode_alu alu;
3734	int i, r;
3735
3736	for (i = 0; i < 4; i++) {
3737		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3738		alu.src[0].sel = ctx->temp_reg;
3739		alu.op = ALU_OP1_MOV;
3740		alu.dst.chan = i;
3741		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3742		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3743		if (i == 3)
3744			alu.last = 1;
3745		r = r600_bytecode_add_alu(ctx->bc, &alu);
3746		if (r)
3747			return r;
3748	}
3749	return 0;
3750}
3751
3752static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
3753{
3754	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3755	struct r600_bytecode_alu alu;
3756	int i, r;
3757
3758	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3759	alu.op = ctx->inst_info->op;
3760	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
3761		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
3762	}
3763	alu.dst.sel = ctx->temp_reg;
3764	alu.dst.write = 1;
3765	alu.last = 1;
3766	r = r600_bytecode_add_alu(ctx->bc, &alu);
3767	if (r)
3768		return r;
3769	/* replicate result */
3770	return tgsi_helper_tempx_replicate(ctx);
3771}
3772
/* CAYMAN POW: POW(a,b) = EXP2(b * LOG2(a)).  The scalar LOG and EXP ops
 * are replicated over the vector slots (no trans unit on CAYMAN). */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* temp = LOG2(a), replicated over three slots */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		/* commit only writemask-selected channels */
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3822
3823static int tgsi_pow(struct r600_shader_ctx *ctx)
3824{
3825	struct r600_bytecode_alu alu;
3826	int r;
3827
3828	/* LOG2(a) */
3829	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3830	alu.op = ALU_OP1_LOG_IEEE;
3831	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3832	alu.dst.sel = ctx->temp_reg;
3833	alu.dst.write = 1;
3834	alu.last = 1;
3835	r = r600_bytecode_add_alu(ctx->bc, &alu);
3836	if (r)
3837		return r;
3838	/* b * LOG2(a) */
3839	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3840	alu.op = ALU_OP2_MUL;
3841	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
3842	alu.src[1].sel = ctx->temp_reg;
3843	alu.dst.sel = ctx->temp_reg;
3844	alu.dst.write = 1;
3845	alu.last = 1;
3846	r = r600_bytecode_add_alu(ctx->bc, &alu);
3847	if (r)
3848		return r;
3849	/* POW(a,b) = EXP2(b * LOG2(a))*/
3850	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3851	alu.op = ALU_OP1_EXP_IEEE;
3852	alu.src[0].sel = ctx->temp_reg;
3853	alu.dst.sel = ctx->temp_reg;
3854	alu.dst.write = 1;
3855	alu.last = 1;
3856	r = r600_bytecode_add_alu(ctx->bc, &alu);
3857	if (r)
3858		return r;
3859	return tgsi_helper_tempx_replicate(ctx);
3860}
3861
3862static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
3863{
3864	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3865	struct r600_bytecode_alu alu;
3866	int i, r, j;
3867	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3868	int tmp0 = ctx->temp_reg;
3869	int tmp1 = r600_get_temp(ctx);
3870	int tmp2 = r600_get_temp(ctx);
3871	int tmp3 = r600_get_temp(ctx);
3872	/* Unsigned path:
3873	 *
3874	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
3875	 *
3876	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
3877	 * 2. tmp0.z = lo (tmp0.x * src2)
3878	 * 3. tmp0.w = -tmp0.z
3879	 * 4. tmp0.y = hi (tmp0.x * src2)
3880	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
3881	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
3882	 * 7. tmp1.x = tmp0.x - tmp0.w
3883	 * 8. tmp1.y = tmp0.x + tmp0.w
3884	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
3885	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
3886	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
3887	 *
3888	 * 12. tmp0.w = src1 - tmp0.y       = r
3889	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
3890	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
3891	 *
3892	 * if DIV
3893	 *
3894	 *   15. tmp1.z = tmp0.z + 1			= q + 1
3895	 *   16. tmp1.w = tmp0.z - 1			= q - 1
3896	 *
3897	 * else MOD
3898	 *
3899	 *   15. tmp1.z = tmp0.w - src2			= r - src2
3900	 *   16. tmp1.w = tmp0.w + src2			= r + src2
3901	 *
3902	 * endif
3903	 *
3904	 * 17. tmp1.x = tmp1.x & tmp1.y
3905	 *
3906	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
3907	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
3908	 *
3909	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
3910	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
3911	 *
3912	 * Signed path:
3913	 *
3914	 * Same as unsigned, using abs values of the operands,
3915	 * and fixing the sign of the result in the end.
3916	 */
3917
3918	for (i = 0; i < 4; i++) {
3919		if (!(write_mask & (1<<i)))
3920			continue;
3921
3922		if (signed_op) {
3923
3924			/* tmp2.x = -src0 */
3925			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3926			alu.op = ALU_OP2_SUB_INT;
3927
3928			alu.dst.sel = tmp2;
3929			alu.dst.chan = 0;
3930			alu.dst.write = 1;
3931
3932			alu.src[0].sel = V_SQ_ALU_SRC_0;
3933
3934			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3935
3936			alu.last = 1;
3937			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3938				return r;
3939
3940			/* tmp2.y = -src1 */
3941			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3942			alu.op = ALU_OP2_SUB_INT;
3943
3944			alu.dst.sel = tmp2;
3945			alu.dst.chan = 1;
3946			alu.dst.write = 1;
3947
3948			alu.src[0].sel = V_SQ_ALU_SRC_0;
3949
3950			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3951
3952			alu.last = 1;
3953			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3954				return r;
3955
3956			/* tmp2.z sign bit is set if src0 and src2 signs are different */
3957			/* it will be a sign of the quotient */
3958			if (!mod) {
3959
3960				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3961				alu.op = ALU_OP2_XOR_INT;
3962
3963				alu.dst.sel = tmp2;
3964				alu.dst.chan = 2;
3965				alu.dst.write = 1;
3966
3967				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3968				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3969
3970				alu.last = 1;
3971				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3972					return r;
3973			}
3974
3975			/* tmp2.x = |src0| */
3976			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3977			alu.op = ALU_OP3_CNDGE_INT;
3978			alu.is_op3 = 1;
3979
3980			alu.dst.sel = tmp2;
3981			alu.dst.chan = 0;
3982			alu.dst.write = 1;
3983
3984			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3985			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3986			alu.src[2].sel = tmp2;
3987			alu.src[2].chan = 0;
3988
3989			alu.last = 1;
3990			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3991				return r;
3992
3993			/* tmp2.y = |src1| */
3994			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3995			alu.op = ALU_OP3_CNDGE_INT;
3996			alu.is_op3 = 1;
3997
3998			alu.dst.sel = tmp2;
3999			alu.dst.chan = 1;
4000			alu.dst.write = 1;
4001
4002			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4003			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4004			alu.src[2].sel = tmp2;
4005			alu.src[2].chan = 1;
4006
4007			alu.last = 1;
4008			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4009				return r;
4010
4011		}
4012
4013		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
4014		if (ctx->bc->chip_class == CAYMAN) {
4015			/* tmp3.x = u2f(src2) */
4016			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4017			alu.op = ALU_OP1_UINT_TO_FLT;
4018
4019			alu.dst.sel = tmp3;
4020			alu.dst.chan = 0;
4021			alu.dst.write = 1;
4022
4023			if (signed_op) {
4024				alu.src[0].sel = tmp2;
4025				alu.src[0].chan = 1;
4026			} else {
4027				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4028			}
4029
4030			alu.last = 1;
4031			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4032				return r;
4033
4034			/* tmp0.x = recip(tmp3.x) */
4035			for (j = 0 ; j < 3; j++) {
4036				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4037				alu.op = ALU_OP1_RECIP_IEEE;
4038
4039				alu.dst.sel = tmp0;
4040				alu.dst.chan = j;
4041				alu.dst.write = (j == 0);
4042
4043				alu.src[0].sel = tmp3;
4044				alu.src[0].chan = 0;
4045
4046				if (j == 2)
4047					alu.last = 1;
4048				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4049					return r;
4050			}
4051
4052			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4053			alu.op = ALU_OP2_MUL;
4054
4055			alu.src[0].sel = tmp0;
4056			alu.src[0].chan = 0;
4057
4058			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4059			alu.src[1].value = 0x4f800000;
4060
4061			alu.dst.sel = tmp3;
4062			alu.dst.write = 1;
4063			alu.last = 1;
4064			r = r600_bytecode_add_alu(ctx->bc, &alu);
4065			if (r)
4066				return r;
4067
4068			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4069			alu.op = ALU_OP1_FLT_TO_UINT;
4070
4071			alu.dst.sel = tmp0;
4072			alu.dst.chan = 0;
4073			alu.dst.write = 1;
4074
4075			alu.src[0].sel = tmp3;
4076			alu.src[0].chan = 0;
4077
4078			alu.last = 1;
4079			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4080				return r;
4081
4082		} else {
4083			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4084			alu.op = ALU_OP1_RECIP_UINT;
4085
4086			alu.dst.sel = tmp0;
4087			alu.dst.chan = 0;
4088			alu.dst.write = 1;
4089
4090			if (signed_op) {
4091				alu.src[0].sel = tmp2;
4092				alu.src[0].chan = 1;
4093			} else {
4094				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4095			}
4096
4097			alu.last = 1;
4098			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4099				return r;
4100		}
4101
4102		/* 2. tmp0.z = lo (tmp0.x * src2) */
4103		if (ctx->bc->chip_class == CAYMAN) {
4104			for (j = 0 ; j < 4; j++) {
4105				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4106				alu.op = ALU_OP2_MULLO_UINT;
4107
4108				alu.dst.sel = tmp0;
4109				alu.dst.chan = j;
4110				alu.dst.write = (j == 2);
4111
4112				alu.src[0].sel = tmp0;
4113				alu.src[0].chan = 0;
4114				if (signed_op) {
4115					alu.src[1].sel = tmp2;
4116					alu.src[1].chan = 1;
4117				} else {
4118					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4119				}
4120
4121				alu.last = (j == 3);
4122				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4123					return r;
4124			}
4125		} else {
4126			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4127			alu.op = ALU_OP2_MULLO_UINT;
4128
4129			alu.dst.sel = tmp0;
4130			alu.dst.chan = 2;
4131			alu.dst.write = 1;
4132
4133			alu.src[0].sel = tmp0;
4134			alu.src[0].chan = 0;
4135			if (signed_op) {
4136				alu.src[1].sel = tmp2;
4137				alu.src[1].chan = 1;
4138			} else {
4139				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4140			}
4141
4142			alu.last = 1;
4143			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4144				return r;
4145		}
4146
4147		/* 3. tmp0.w = -tmp0.z */
4148		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4149		alu.op = ALU_OP2_SUB_INT;
4150
4151		alu.dst.sel = tmp0;
4152		alu.dst.chan = 3;
4153		alu.dst.write = 1;
4154
4155		alu.src[0].sel = V_SQ_ALU_SRC_0;
4156		alu.src[1].sel = tmp0;
4157		alu.src[1].chan = 2;
4158
4159		alu.last = 1;
4160		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4161			return r;
4162
4163		/* 4. tmp0.y = hi (tmp0.x * src2) */
4164		if (ctx->bc->chip_class == CAYMAN) {
4165			for (j = 0 ; j < 4; j++) {
4166				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4167				alu.op = ALU_OP2_MULHI_UINT;
4168
4169				alu.dst.sel = tmp0;
4170				alu.dst.chan = j;
4171				alu.dst.write = (j == 1);
4172
4173				alu.src[0].sel = tmp0;
4174				alu.src[0].chan = 0;
4175
4176				if (signed_op) {
4177					alu.src[1].sel = tmp2;
4178					alu.src[1].chan = 1;
4179				} else {
4180					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4181				}
4182				alu.last = (j == 3);
4183				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4184					return r;
4185			}
4186		} else {
4187			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4188			alu.op = ALU_OP2_MULHI_UINT;
4189
4190			alu.dst.sel = tmp0;
4191			alu.dst.chan = 1;
4192			alu.dst.write = 1;
4193
4194			alu.src[0].sel = tmp0;
4195			alu.src[0].chan = 0;
4196
4197			if (signed_op) {
4198				alu.src[1].sel = tmp2;
4199				alu.src[1].chan = 1;
4200			} else {
4201				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4202			}
4203
4204			alu.last = 1;
4205			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4206				return r;
4207		}
4208
4209		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
4210		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4211		alu.op = ALU_OP3_CNDE_INT;
4212		alu.is_op3 = 1;
4213
4214		alu.dst.sel = tmp0;
4215		alu.dst.chan = 2;
4216		alu.dst.write = 1;
4217
4218		alu.src[0].sel = tmp0;
4219		alu.src[0].chan = 1;
4220		alu.src[1].sel = tmp0;
4221		alu.src[1].chan = 3;
4222		alu.src[2].sel = tmp0;
4223		alu.src[2].chan = 2;
4224
4225		alu.last = 1;
4226		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4227			return r;
4228
4229		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
4230		if (ctx->bc->chip_class == CAYMAN) {
4231			for (j = 0 ; j < 4; j++) {
4232				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4233				alu.op = ALU_OP2_MULHI_UINT;
4234
4235				alu.dst.sel = tmp0;
4236				alu.dst.chan = j;
4237				alu.dst.write = (j == 3);
4238
4239				alu.src[0].sel = tmp0;
4240				alu.src[0].chan = 2;
4241
4242				alu.src[1].sel = tmp0;
4243				alu.src[1].chan = 0;
4244
4245				alu.last = (j == 3);
4246				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4247					return r;
4248			}
4249		} else {
4250			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4251			alu.op = ALU_OP2_MULHI_UINT;
4252
4253			alu.dst.sel = tmp0;
4254			alu.dst.chan = 3;
4255			alu.dst.write = 1;
4256
4257			alu.src[0].sel = tmp0;
4258			alu.src[0].chan = 2;
4259
4260			alu.src[1].sel = tmp0;
4261			alu.src[1].chan = 0;
4262
4263			alu.last = 1;
4264			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4265				return r;
4266		}
4267
4268		/* 7. tmp1.x = tmp0.x - tmp0.w */
4269		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4270		alu.op = ALU_OP2_SUB_INT;
4271
4272		alu.dst.sel = tmp1;
4273		alu.dst.chan = 0;
4274		alu.dst.write = 1;
4275
4276		alu.src[0].sel = tmp0;
4277		alu.src[0].chan = 0;
4278		alu.src[1].sel = tmp0;
4279		alu.src[1].chan = 3;
4280
4281		alu.last = 1;
4282		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4283			return r;
4284
4285		/* 8. tmp1.y = tmp0.x + tmp0.w */
4286		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4287		alu.op = ALU_OP2_ADD_INT;
4288
4289		alu.dst.sel = tmp1;
4290		alu.dst.chan = 1;
4291		alu.dst.write = 1;
4292
4293		alu.src[0].sel = tmp0;
4294		alu.src[0].chan = 0;
4295		alu.src[1].sel = tmp0;
4296		alu.src[1].chan = 3;
4297
4298		alu.last = 1;
4299		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4300			return r;
4301
4302		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
4303		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4304		alu.op = ALU_OP3_CNDE_INT;
4305		alu.is_op3 = 1;
4306
4307		alu.dst.sel = tmp0;
4308		alu.dst.chan = 0;
4309		alu.dst.write = 1;
4310
4311		alu.src[0].sel = tmp0;
4312		alu.src[0].chan = 1;
4313		alu.src[1].sel = tmp1;
4314		alu.src[1].chan = 1;
4315		alu.src[2].sel = tmp1;
4316		alu.src[2].chan = 0;
4317
4318		alu.last = 1;
4319		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4320			return r;
4321
4322		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
4323		if (ctx->bc->chip_class == CAYMAN) {
4324			for (j = 0 ; j < 4; j++) {
4325				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4326				alu.op = ALU_OP2_MULHI_UINT;
4327
4328				alu.dst.sel = tmp0;
4329				alu.dst.chan = j;
4330				alu.dst.write = (j == 2);
4331
4332				alu.src[0].sel = tmp0;
4333				alu.src[0].chan = 0;
4334
4335				if (signed_op) {
4336					alu.src[1].sel = tmp2;
4337					alu.src[1].chan = 0;
4338				} else {
4339					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4340				}
4341
4342				alu.last = (j == 3);
4343				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4344					return r;
4345			}
4346		} else {
4347			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4348			alu.op = ALU_OP2_MULHI_UINT;
4349
4350			alu.dst.sel = tmp0;
4351			alu.dst.chan = 2;
4352			alu.dst.write = 1;
4353
4354			alu.src[0].sel = tmp0;
4355			alu.src[0].chan = 0;
4356
4357			if (signed_op) {
4358				alu.src[1].sel = tmp2;
4359				alu.src[1].chan = 0;
4360			} else {
4361				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4362			}
4363
4364			alu.last = 1;
4365			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4366				return r;
4367		}
4368
4369		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
4370		if (ctx->bc->chip_class == CAYMAN) {
4371			for (j = 0 ; j < 4; j++) {
4372				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4373				alu.op = ALU_OP2_MULLO_UINT;
4374
4375				alu.dst.sel = tmp0;
4376				alu.dst.chan = j;
4377				alu.dst.write = (j == 1);
4378
4379				if (signed_op) {
4380					alu.src[0].sel = tmp2;
4381					alu.src[0].chan = 1;
4382				} else {
4383					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4384				}
4385
4386				alu.src[1].sel = tmp0;
4387				alu.src[1].chan = 2;
4388
4389				alu.last = (j == 3);
4390				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4391					return r;
4392			}
4393		} else {
4394			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4395			alu.op = ALU_OP2_MULLO_UINT;
4396
4397			alu.dst.sel = tmp0;
4398			alu.dst.chan = 1;
4399			alu.dst.write = 1;
4400
4401			if (signed_op) {
4402				alu.src[0].sel = tmp2;
4403				alu.src[0].chan = 1;
4404			} else {
4405				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4406			}
4407
4408			alu.src[1].sel = tmp0;
4409			alu.src[1].chan = 2;
4410
4411			alu.last = 1;
4412			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4413				return r;
4414		}
4415
4416		/* 12. tmp0.w = src1 - tmp0.y       = r */
4417		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4418		alu.op = ALU_OP2_SUB_INT;
4419
4420		alu.dst.sel = tmp0;
4421		alu.dst.chan = 3;
4422		alu.dst.write = 1;
4423
4424		if (signed_op) {
4425			alu.src[0].sel = tmp2;
4426			alu.src[0].chan = 0;
4427		} else {
4428			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4429		}
4430
4431		alu.src[1].sel = tmp0;
4432		alu.src[1].chan = 1;
4433
4434		alu.last = 1;
4435		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4436			return r;
4437
4438		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
4439		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4440		alu.op = ALU_OP2_SETGE_UINT;
4441
4442		alu.dst.sel = tmp1;
4443		alu.dst.chan = 0;
4444		alu.dst.write = 1;
4445
4446		alu.src[0].sel = tmp0;
4447		alu.src[0].chan = 3;
4448		if (signed_op) {
4449			alu.src[1].sel = tmp2;
4450			alu.src[1].chan = 1;
4451		} else {
4452			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4453		}
4454
4455		alu.last = 1;
4456		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4457			return r;
4458
4459		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
4460		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4461		alu.op = ALU_OP2_SETGE_UINT;
4462
4463		alu.dst.sel = tmp1;
4464		alu.dst.chan = 1;
4465		alu.dst.write = 1;
4466
4467		if (signed_op) {
4468			alu.src[0].sel = tmp2;
4469			alu.src[0].chan = 0;
4470		} else {
4471			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4472		}
4473
4474		alu.src[1].sel = tmp0;
4475		alu.src[1].chan = 1;
4476
4477		alu.last = 1;
4478		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4479			return r;
4480
4481		if (mod) { /* UMOD */
4482
4483			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
4484			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4485			alu.op = ALU_OP2_SUB_INT;
4486
4487			alu.dst.sel = tmp1;
4488			alu.dst.chan = 2;
4489			alu.dst.write = 1;
4490
4491			alu.src[0].sel = tmp0;
4492			alu.src[0].chan = 3;
4493
4494			if (signed_op) {
4495				alu.src[1].sel = tmp2;
4496				alu.src[1].chan = 1;
4497			} else {
4498				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4499			}
4500
4501			alu.last = 1;
4502			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4503				return r;
4504
4505			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
4506			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4507			alu.op = ALU_OP2_ADD_INT;
4508
4509			alu.dst.sel = tmp1;
4510			alu.dst.chan = 3;
4511			alu.dst.write = 1;
4512
4513			alu.src[0].sel = tmp0;
4514			alu.src[0].chan = 3;
4515			if (signed_op) {
4516				alu.src[1].sel = tmp2;
4517				alu.src[1].chan = 1;
4518			} else {
4519				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4520			}
4521
4522			alu.last = 1;
4523			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4524				return r;
4525
4526		} else { /* UDIV */
4527
4528			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
4529			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4530			alu.op = ALU_OP2_ADD_INT;
4531
4532			alu.dst.sel = tmp1;
4533			alu.dst.chan = 2;
4534			alu.dst.write = 1;
4535
4536			alu.src[0].sel = tmp0;
4537			alu.src[0].chan = 2;
4538			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
4539
4540			alu.last = 1;
4541			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4542				return r;
4543
4544			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
4545			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4546			alu.op = ALU_OP2_ADD_INT;
4547
4548			alu.dst.sel = tmp1;
4549			alu.dst.chan = 3;
4550			alu.dst.write = 1;
4551
4552			alu.src[0].sel = tmp0;
4553			alu.src[0].chan = 2;
4554			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
4555
4556			alu.last = 1;
4557			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4558				return r;
4559
4560		}
4561
4562		/* 17. tmp1.x = tmp1.x & tmp1.y */
4563		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4564		alu.op = ALU_OP2_AND_INT;
4565
4566		alu.dst.sel = tmp1;
4567		alu.dst.chan = 0;
4568		alu.dst.write = 1;
4569
4570		alu.src[0].sel = tmp1;
4571		alu.src[0].chan = 0;
4572		alu.src[1].sel = tmp1;
4573		alu.src[1].chan = 1;
4574
4575		alu.last = 1;
4576		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4577			return r;
4578
4579		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
4580		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
4581		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4582		alu.op = ALU_OP3_CNDE_INT;
4583		alu.is_op3 = 1;
4584
4585		alu.dst.sel = tmp0;
4586		alu.dst.chan = 2;
4587		alu.dst.write = 1;
4588
4589		alu.src[0].sel = tmp1;
4590		alu.src[0].chan = 0;
4591		alu.src[1].sel = tmp0;
4592		alu.src[1].chan = mod ? 3 : 2;
4593		alu.src[2].sel = tmp1;
4594		alu.src[2].chan = 2;
4595
4596		alu.last = 1;
4597		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4598			return r;
4599
4600		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
4601		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4602		alu.op = ALU_OP3_CNDE_INT;
4603		alu.is_op3 = 1;
4604
4605		if (signed_op) {
4606			alu.dst.sel = tmp0;
4607			alu.dst.chan = 2;
4608			alu.dst.write = 1;
4609		} else {
4610			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4611		}
4612
4613		alu.src[0].sel = tmp1;
4614		alu.src[0].chan = 1;
4615		alu.src[1].sel = tmp1;
4616		alu.src[1].chan = 3;
4617		alu.src[2].sel = tmp0;
4618		alu.src[2].chan = 2;
4619
4620		alu.last = 1;
4621		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4622			return r;
4623
4624		if (signed_op) {
4625
4626			/* fix the sign of the result */
4627
4628			if (mod) {
4629
4630				/* tmp0.x = -tmp0.z */
4631				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4632				alu.op = ALU_OP2_SUB_INT;
4633
4634				alu.dst.sel = tmp0;
4635				alu.dst.chan = 0;
4636				alu.dst.write = 1;
4637
4638				alu.src[0].sel = V_SQ_ALU_SRC_0;
4639				alu.src[1].sel = tmp0;
4640				alu.src[1].chan = 2;
4641
4642				alu.last = 1;
4643				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4644					return r;
4645
4646				/* sign of the remainder is the same as the sign of src0 */
4647				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
4648				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4649				alu.op = ALU_OP3_CNDGE_INT;
4650				alu.is_op3 = 1;
4651
4652				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4653
4654				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4655				alu.src[1].sel = tmp0;
4656				alu.src[1].chan = 2;
4657				alu.src[2].sel = tmp0;
4658				alu.src[2].chan = 0;
4659
4660				alu.last = 1;
4661				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4662					return r;
4663
4664			} else {
4665
4666				/* tmp0.x = -tmp0.z */
4667				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4668				alu.op = ALU_OP2_SUB_INT;
4669
4670				alu.dst.sel = tmp0;
4671				alu.dst.chan = 0;
4672				alu.dst.write = 1;
4673
4674				alu.src[0].sel = V_SQ_ALU_SRC_0;
4675				alu.src[1].sel = tmp0;
4676				alu.src[1].chan = 2;
4677
4678				alu.last = 1;
4679				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4680					return r;
4681
4682				/* fix the quotient sign (same as the sign of src0*src1) */
4683				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
4684				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4685				alu.op = ALU_OP3_CNDGE_INT;
4686				alu.is_op3 = 1;
4687
4688				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4689
4690				alu.src[0].sel = tmp2;
4691				alu.src[0].chan = 2;
4692				alu.src[1].sel = tmp0;
4693				alu.src[1].chan = 2;
4694				alu.src[2].sel = tmp0;
4695				alu.src[2].chan = 0;
4696
4697				alu.last = 1;
4698				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4699					return r;
4700			}
4701		}
4702	}
4703	return 0;
4704}
4705
/* TGSI_OPCODE_UDIV: unsigned integer division, delegated to the shared
 * div/mod emitter (quotient path, unsigned operands). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, /*mod=*/0, /*signed_op=*/0);
}
4710
/* TGSI_OPCODE_UMOD: unsigned integer remainder, delegated to the shared
 * div/mod emitter (remainder path, unsigned operands). */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, /*mod=*/1, /*signed_op=*/0);
}
4715
/* TGSI_OPCODE_IDIV: signed integer division, delegated to the shared
 * div/mod emitter (quotient path, signed operands). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, /*mod=*/0, /*signed_op=*/1);
}
4720
/* TGSI_OPCODE_IMOD: signed integer remainder, delegated to the shared
 * div/mod emitter (remainder path, signed operands). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, /*mod=*/1, /*signed_op=*/1);
}
4725
4726
4727static int tgsi_f2i(struct r600_shader_ctx *ctx)
4728{
4729	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4730	struct r600_bytecode_alu alu;
4731	int i, r;
4732	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4733	int last_inst = tgsi_last_instruction(write_mask);
4734
4735	for (i = 0; i < 4; i++) {
4736		if (!(write_mask & (1<<i)))
4737			continue;
4738
4739		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4740		alu.op = ALU_OP1_TRUNC;
4741
4742		alu.dst.sel = ctx->temp_reg;
4743		alu.dst.chan = i;
4744		alu.dst.write = 1;
4745
4746		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4747		if (i == last_inst)
4748			alu.last = 1;
4749		r = r600_bytecode_add_alu(ctx->bc, &alu);
4750		if (r)
4751			return r;
4752	}
4753
4754	for (i = 0; i < 4; i++) {
4755		if (!(write_mask & (1<<i)))
4756			continue;
4757
4758		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4759		alu.op = ctx->inst_info->op;
4760
4761		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4762
4763		alu.src[0].sel = ctx->temp_reg;
4764		alu.src[0].chan = i;
4765
4766		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
4767			alu.last = 1;
4768		r = r600_bytecode_add_alu(ctx->bc, &alu);
4769		if (r)
4770			return r;
4771	}
4772
4773	return 0;
4774}
4775
4776static int tgsi_iabs(struct r600_shader_ctx *ctx)
4777{
4778	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4779	struct r600_bytecode_alu alu;
4780	int i, r;
4781	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4782	int last_inst = tgsi_last_instruction(write_mask);
4783
4784	/* tmp = -src */
4785	for (i = 0; i < 4; i++) {
4786		if (!(write_mask & (1<<i)))
4787			continue;
4788
4789		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4790		alu.op = ALU_OP2_SUB_INT;
4791
4792		alu.dst.sel = ctx->temp_reg;
4793		alu.dst.chan = i;
4794		alu.dst.write = 1;
4795
4796		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4797		alu.src[0].sel = V_SQ_ALU_SRC_0;
4798
4799		if (i == last_inst)
4800			alu.last = 1;
4801		r = r600_bytecode_add_alu(ctx->bc, &alu);
4802		if (r)
4803			return r;
4804	}
4805
4806	/* dst = (src >= 0 ? src : tmp) */
4807	for (i = 0; i < 4; i++) {
4808		if (!(write_mask & (1<<i)))
4809			continue;
4810
4811		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4812		alu.op = ALU_OP3_CNDGE_INT;
4813		alu.is_op3 = 1;
4814		alu.dst.write = 1;
4815
4816		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4817
4818		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4819		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4820		alu.src[2].sel = ctx->temp_reg;
4821		alu.src[2].chan = i;
4822
4823		if (i == last_inst)
4824			alu.last = 1;
4825		r = r600_bytecode_add_alu(ctx->bc, &alu);
4826		if (r)
4827			return r;
4828	}
4829	return 0;
4830}
4831
4832static int tgsi_issg(struct r600_shader_ctx *ctx)
4833{
4834	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4835	struct r600_bytecode_alu alu;
4836	int i, r;
4837	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4838	int last_inst = tgsi_last_instruction(write_mask);
4839
4840	/* tmp = (src >= 0 ? src : -1) */
4841	for (i = 0; i < 4; i++) {
4842		if (!(write_mask & (1<<i)))
4843			continue;
4844
4845		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4846		alu.op = ALU_OP3_CNDGE_INT;
4847		alu.is_op3 = 1;
4848
4849		alu.dst.sel = ctx->temp_reg;
4850		alu.dst.chan = i;
4851		alu.dst.write = 1;
4852
4853		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4854		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4855		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
4856
4857		if (i == last_inst)
4858			alu.last = 1;
4859		r = r600_bytecode_add_alu(ctx->bc, &alu);
4860		if (r)
4861			return r;
4862	}
4863
4864	/* dst = (tmp > 0 ? 1 : tmp) */
4865	for (i = 0; i < 4; i++) {
4866		if (!(write_mask & (1<<i)))
4867			continue;
4868
4869		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4870		alu.op = ALU_OP3_CNDGT_INT;
4871		alu.is_op3 = 1;
4872		alu.dst.write = 1;
4873
4874		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4875
4876		alu.src[0].sel = ctx->temp_reg;
4877		alu.src[0].chan = i;
4878
4879		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
4880
4881		alu.src[2].sel = ctx->temp_reg;
4882		alu.src[2].chan = i;
4883
4884		if (i == last_inst)
4885			alu.last = 1;
4886		r = r600_bytecode_add_alu(ctx->bc, &alu);
4887		if (r)
4888			return r;
4889	}
4890	return 0;
4891}
4892
4893
4894
4895static int tgsi_ssg(struct r600_shader_ctx *ctx)
4896{
4897	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4898	struct r600_bytecode_alu alu;
4899	int i, r;
4900
4901	/* tmp = (src > 0 ? 1 : src) */
4902	for (i = 0; i < 4; i++) {
4903		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4904		alu.op = ALU_OP3_CNDGT;
4905		alu.is_op3 = 1;
4906
4907		alu.dst.sel = ctx->temp_reg;
4908		alu.dst.chan = i;
4909
4910		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4911		alu.src[1].sel = V_SQ_ALU_SRC_1;
4912		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
4913
4914		if (i == 3)
4915			alu.last = 1;
4916		r = r600_bytecode_add_alu(ctx->bc, &alu);
4917		if (r)
4918			return r;
4919	}
4920
4921	/* dst = (-tmp > 0 ? -1 : tmp) */
4922	for (i = 0; i < 4; i++) {
4923		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4924		alu.op = ALU_OP3_CNDGT;
4925		alu.is_op3 = 1;
4926		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4927
4928		alu.src[0].sel = ctx->temp_reg;
4929		alu.src[0].chan = i;
4930		alu.src[0].neg = 1;
4931
4932		alu.src[1].sel = V_SQ_ALU_SRC_1;
4933		alu.src[1].neg = 1;
4934
4935		alu.src[2].sel = ctx->temp_reg;
4936		alu.src[2].chan = i;
4937
4938		if (i == 3)
4939			alu.last = 1;
4940		r = r600_bytecode_add_alu(ctx->bc, &alu);
4941		if (r)
4942			return r;
4943	}
4944	return 0;
4945}
4946
4947static int tgsi_bfi(struct r600_shader_ctx *ctx)
4948{
4949	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4950	struct r600_bytecode_alu alu;
4951	int i, r, t1, t2;
4952
4953	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4954	int last_inst = tgsi_last_instruction(write_mask);
4955
4956	t1 = ctx->temp_reg;
4957
4958	for (i = 0; i < 4; i++) {
4959		if (!(write_mask & (1<<i)))
4960			continue;
4961
4962		/* create mask tmp */
4963		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4964		alu.op = ALU_OP2_BFM_INT;
4965		alu.dst.sel = t1;
4966		alu.dst.chan = i;
4967		alu.dst.write = 1;
4968		alu.last = i == last_inst;
4969
4970		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
4971		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4972
4973		r = r600_bytecode_add_alu(ctx->bc, &alu);
4974		if (r)
4975			return r;
4976	}
4977
4978	t2 = r600_get_temp(ctx);
4979
4980	for (i = 0; i < 4; i++) {
4981		if (!(write_mask & (1<<i)))
4982			continue;
4983
4984		/* shift insert left */
4985		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4986		alu.op = ALU_OP2_LSHL_INT;
4987		alu.dst.sel = t2;
4988		alu.dst.chan = i;
4989		alu.dst.write = 1;
4990		alu.last = i == last_inst;
4991
4992		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4993		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4994
4995		r = r600_bytecode_add_alu(ctx->bc, &alu);
4996		if (r)
4997			return r;
4998	}
4999
5000	for (i = 0; i < 4; i++) {
5001		if (!(write_mask & (1<<i)))
5002			continue;
5003
5004		/* actual bitfield insert */
5005		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5006		alu.op = ALU_OP3_BFI_INT;
5007		alu.is_op3 = 1;
5008		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5009		alu.dst.chan = i;
5010		alu.dst.write = 1;
5011		alu.last = i == last_inst;
5012
5013		alu.src[0].sel = t1;
5014		alu.src[0].chan = i;
5015		alu.src[1].sel = t2;
5016		alu.src[1].chan = i;
5017		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
5018
5019		r = r600_bytecode_add_alu(ctx->bc, &alu);
5020		if (r)
5021			return r;
5022	}
5023
5024	return 0;
5025}
5026
5027static int tgsi_msb(struct r600_shader_ctx *ctx)
5028{
5029	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5030	struct r600_bytecode_alu alu;
5031	int i, r, t1, t2;
5032
5033	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5034	int last_inst = tgsi_last_instruction(write_mask);
5035
5036	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
5037		ctx->inst_info->op == ALU_OP1_FFBH_UINT);
5038
5039	t1 = ctx->temp_reg;
5040
5041	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
5042	for (i = 0; i < 4; i++) {
5043		if (!(write_mask & (1<<i)))
5044			continue;
5045
5046		/* t1 = FFBH_INT / FFBH_UINT */
5047		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5048		alu.op = ctx->inst_info->op;
5049		alu.dst.sel = t1;
5050		alu.dst.chan = i;
5051		alu.dst.write = 1;
5052		alu.last = i == last_inst;
5053
5054		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5055
5056		r = r600_bytecode_add_alu(ctx->bc, &alu);
5057		if (r)
5058			return r;
5059	}
5060
5061	t2 = r600_get_temp(ctx);
5062
5063	for (i = 0; i < 4; i++) {
5064		if (!(write_mask & (1<<i)))
5065			continue;
5066
5067		/* t2 = 31 - t1 */
5068		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5069		alu.op = ALU_OP2_SUB_INT;
5070		alu.dst.sel = t2;
5071		alu.dst.chan = i;
5072		alu.dst.write = 1;
5073		alu.last = i == last_inst;
5074
5075		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
5076		alu.src[0].value = 31;
5077		alu.src[1].sel = t1;
5078		alu.src[1].chan = i;
5079
5080		r = r600_bytecode_add_alu(ctx->bc, &alu);
5081		if (r)
5082			return r;
5083	}
5084
5085	for (i = 0; i < 4; i++) {
5086		if (!(write_mask & (1<<i)))
5087			continue;
5088
5089		/* result = t1 >= 0 ? t2 : t1 */
5090		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5091		alu.op = ALU_OP3_CNDGE_INT;
5092		alu.is_op3 = 1;
5093		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5094		alu.dst.chan = i;
5095		alu.dst.write = 1;
5096		alu.last = i == last_inst;
5097
5098		alu.src[0].sel = t1;
5099		alu.src[0].chan = i;
5100		alu.src[1].sel = t2;
5101		alu.src[1].chan = i;
5102		alu.src[2].sel = t1;
5103		alu.src[2].chan = i;
5104
5105		r = r600_bytecode_add_alu(ctx->bc, &alu);
5106		if (r)
5107			return r;
5108	}
5109
5110	return 0;
5111}
5112
5113static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
5114{
5115	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5116	struct r600_bytecode_alu alu;
5117	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
5118	unsigned location;
5119	int input;
5120
5121	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5122
5123	input = inst->Src[0].Register.Index;
5124
5125	/* Interpolators have been marked for use already by allocate_system_value_inputs */
5126	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5127		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5128		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
5129	}
5130	else {
5131		location = TGSI_INTERPOLATE_LOC_CENTROID;
5132	}
5133
5134	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
5135	if (k < 0)
5136		k = 0;
5137	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
5138	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);
5139
5140	/* NOTE: currently offset is not perspective correct */
5141	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5142		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5143		int sample_gpr = -1;
5144		int gradientsH, gradientsV;
5145		struct r600_bytecode_tex tex;
5146
5147		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5148			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
5149		}
5150
5151		gradientsH = r600_get_temp(ctx);
5152		gradientsV = r600_get_temp(ctx);
5153		for (i = 0; i < 2; i++) {
5154			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5155			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
5156			tex.src_gpr = interp_gpr;
5157			tex.src_sel_x = interp_base_chan + 0;
5158			tex.src_sel_y = interp_base_chan + 1;
5159			tex.src_sel_z = 0;
5160			tex.src_sel_w = 0;
5161			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
5162			tex.dst_sel_x = 0;
5163			tex.dst_sel_y = 1;
5164			tex.dst_sel_z = 7;
5165			tex.dst_sel_w = 7;
5166			tex.inst_mod = 1; // Use per pixel gradient calculation
5167			tex.sampler_id = 0;
5168			tex.resource_id = tex.sampler_id;
5169			r = r600_bytecode_add_tex(ctx->bc, &tex);
5170			if (r)
5171				return r;
5172		}
5173
5174		for (i = 0; i < 2; i++) {
5175			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5176			alu.op = ALU_OP3_MULADD;
5177			alu.is_op3 = 1;
5178			alu.src[0].sel = gradientsH;
5179			alu.src[0].chan = i;
5180			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5181				alu.src[1].sel = sample_gpr;
5182				alu.src[1].chan = 2;
5183			}
5184			else {
5185				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
5186			}
5187			alu.src[2].sel = interp_gpr;
5188			alu.src[2].chan = interp_base_chan + i;
5189			alu.dst.sel = ctx->temp_reg;
5190			alu.dst.chan = i;
5191			alu.last = i == 1;
5192
5193			r = r600_bytecode_add_alu(ctx->bc, &alu);
5194			if (r)
5195				return r;
5196		}
5197
5198		for (i = 0; i < 2; i++) {
5199			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5200			alu.op = ALU_OP3_MULADD;
5201			alu.is_op3 = 1;
5202			alu.src[0].sel = gradientsV;
5203			alu.src[0].chan = i;
5204			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5205				alu.src[1].sel = sample_gpr;
5206				alu.src[1].chan = 3;
5207			}
5208			else {
5209				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
5210			}
5211			alu.src[2].sel = ctx->temp_reg;
5212			alu.src[2].chan = i;
5213			alu.dst.sel = ctx->temp_reg;
5214			alu.dst.chan = i;
5215			alu.last = i == 1;
5216
5217			r = r600_bytecode_add_alu(ctx->bc, &alu);
5218			if (r)
5219				return r;
5220		}
5221	}
5222
5223	tmp = r600_get_temp(ctx);
5224	for (i = 0; i < 8; i++) {
5225		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5226		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;
5227
5228		alu.dst.sel = tmp;
5229		if ((i > 1 && i < 6)) {
5230			alu.dst.write = 1;
5231		}
5232		else {
5233			alu.dst.write = 0;
5234		}
5235		alu.dst.chan = i % 4;
5236
5237		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5238			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5239			alu.src[0].sel = ctx->temp_reg;
5240			alu.src[0].chan = 1 - (i % 2);
5241		} else {
5242			alu.src[0].sel = interp_gpr;
5243			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
5244		}
5245		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
5246		alu.src[1].chan = 0;
5247
5248		alu.last = i % 4 == 3;
5249		alu.bank_swizzle_force = SQ_ALU_VEC_210;
5250
5251		r = r600_bytecode_add_alu(ctx->bc, &alu);
5252		if (r)
5253			return r;
5254	}
5255
5256	// INTERP can't swizzle dst
5257	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5258	for (i = 0; i <= lasti; i++) {
5259		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5260			continue;
5261
5262		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5263		alu.op = ALU_OP1_MOV;
5264		alu.src[0].sel = tmp;
5265		alu.src[0].chan = ctx->src[0].swizzle[i];
5266		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5267		alu.dst.write = 1;
5268		alu.last = i == lasti;
5269		r = r600_bytecode_add_alu(ctx->bc, &alu);
5270		if (r)
5271			return r;
5272	}
5273
5274	return 0;
5275}
5276
5277
5278static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
5279{
5280	struct r600_bytecode_alu alu;
5281	int i, r;
5282
5283	for (i = 0; i < 4; i++) {
5284		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5285		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
5286			alu.op = ALU_OP0_NOP;
5287			alu.dst.chan = i;
5288		} else {
5289			alu.op = ALU_OP1_MOV;
5290			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5291			alu.src[0].sel = ctx->temp_reg;
5292			alu.src[0].chan = i;
5293		}
5294		if (i == 3) {
5295			alu.last = 1;
5296		}
5297		r = r600_bytecode_add_alu(ctx->bc, &alu);
5298		if (r)
5299			return r;
5300	}
5301	return 0;
5302}
5303
5304static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
5305                                 unsigned temp, int chan,
5306                                 struct r600_bytecode_alu_src *bc_src,
5307                                 const struct r600_shader_src *shader_src)
5308{
5309	struct r600_bytecode_alu alu;
5310	int r;
5311
5312	r600_bytecode_src(bc_src, shader_src, chan);
5313
5314	/* op3 operands don't support abs modifier */
5315	if (bc_src->abs) {
5316		assert(temp!=0);      /* we actually need the extra register, make sure it is allocated. */
5317		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5318		alu.op = ALU_OP1_MOV;
5319		alu.dst.sel = temp;
5320		alu.dst.chan = chan;
5321		alu.dst.write = 1;
5322
5323		alu.src[0] = *bc_src;
5324		alu.last = true; // sufficient?
5325		r = r600_bytecode_add_alu(ctx->bc, &alu);
5326		if (r)
5327			return r;
5328
5329		memset(bc_src, 0, sizeof(*bc_src));
5330		bc_src->sel = temp;
5331		bc_src->chan = chan;
5332	}
5333	return 0;
5334}
5335
5336static int tgsi_op3(struct r600_shader_ctx *ctx)
5337{
5338	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5339	struct r600_bytecode_alu alu;
5340	int i, j, r;
5341	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5342	int temp_regs[4];
5343
5344	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5345		temp_regs[j] = 0;
5346		if (ctx->src[j].abs)
5347			temp_regs[j] = r600_get_temp(ctx);
5348	}
5349	for (i = 0; i < lasti + 1; i++) {
5350		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5351			continue;
5352
5353		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5354		alu.op = ctx->inst_info->op;
5355		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5356			r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
5357			if (r)
5358				return r;
5359		}
5360
5361		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5362		alu.dst.chan = i;
5363		alu.dst.write = 1;
5364		alu.is_op3 = 1;
5365		if (i == lasti) {
5366			alu.last = 1;
5367		}
5368		r = r600_bytecode_add_alu(ctx->bc, &alu);
5369		if (r)
5370			return r;
5371	}
5372	return 0;
5373}
5374
5375static int tgsi_dp(struct r600_shader_ctx *ctx)
5376{
5377	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5378	struct r600_bytecode_alu alu;
5379	int i, j, r;
5380
5381	for (i = 0; i < 4; i++) {
5382		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5383		alu.op = ctx->inst_info->op;
5384		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5385			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5386		}
5387
5388		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5389		alu.dst.chan = i;
5390		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5391		/* handle some special cases */
5392		switch (inst->Instruction.Opcode) {
5393		case TGSI_OPCODE_DP2:
5394			if (i > 1) {
5395				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
5396				alu.src[0].chan = alu.src[1].chan = 0;
5397			}
5398			break;
5399		case TGSI_OPCODE_DP3:
5400			if (i > 2) {
5401				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
5402				alu.src[0].chan = alu.src[1].chan = 0;
5403			}
5404			break;
5405		case TGSI_OPCODE_DPH:
5406			if (i == 3) {
5407				alu.src[0].sel = V_SQ_ALU_SRC_1;
5408				alu.src[0].chan = 0;
5409				alu.src[0].neg = 0;
5410			}
5411			break;
5412		default:
5413			break;
5414		}
5415		if (i == 3) {
5416			alu.last = 1;
5417		}
5418		r = r600_bytecode_add_alu(ctx->bc, &alu);
5419		if (r)
5420			return r;
5421	}
5422	return 0;
5423}
5424
5425static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
5426						    unsigned index)
5427{
5428	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5429	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
5430		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
5431		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
5432		ctx->src[index].neg || ctx->src[index].abs ||
5433		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == TGSI_PROCESSOR_GEOMETRY);
5434}
5435
5436static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
5437					unsigned index)
5438{
5439	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5440	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
5441}
5442
5443static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
5444{
5445	struct r600_bytecode_vtx vtx;
5446	struct r600_bytecode_alu alu;
5447	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5448	int src_gpr, r, i;
5449	int id = tgsi_tex_get_src_gpr(ctx, 1);
5450
5451	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5452	if (src_requires_loading) {
5453		for (i = 0; i < 4; i++) {
5454			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5455			alu.op = ALU_OP1_MOV;
5456			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5457			alu.dst.sel = ctx->temp_reg;
5458			alu.dst.chan = i;
5459			if (i == 3)
5460				alu.last = 1;
5461			alu.dst.write = 1;
5462			r = r600_bytecode_add_alu(ctx->bc, &alu);
5463			if (r)
5464				return r;
5465		}
5466		src_gpr = ctx->temp_reg;
5467	}
5468
5469	memset(&vtx, 0, sizeof(vtx));
5470	vtx.op = FETCH_OP_VFETCH;
5471	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
5472	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
5473	vtx.src_gpr = src_gpr;
5474	vtx.mega_fetch_count = 16;
5475	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
5476	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
5477	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
5478	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
5479	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
5480	vtx.use_const_fields = 1;
5481
5482	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
5483		return r;
5484
5485	if (ctx->bc->chip_class >= EVERGREEN)
5486		return 0;
5487
5488	for (i = 0; i < 4; i++) {
5489		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5490		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5491			continue;
5492
5493		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5494		alu.op = ALU_OP2_AND_INT;
5495
5496		alu.dst.chan = i;
5497		alu.dst.sel = vtx.dst_gpr;
5498		alu.dst.write = 1;
5499
5500		alu.src[0].sel = vtx.dst_gpr;
5501		alu.src[0].chan = i;
5502
5503		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
5504		alu.src[1].sel += (id * 2);
5505		alu.src[1].chan = i % 4;
5506		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5507
5508		if (i == lasti)
5509			alu.last = 1;
5510		r = r600_bytecode_add_alu(ctx->bc, &alu);
5511		if (r)
5512			return r;
5513	}
5514
5515	if (inst->Dst[0].Register.WriteMask & 3) {
5516		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5517		alu.op = ALU_OP2_OR_INT;
5518
5519		alu.dst.chan = 3;
5520		alu.dst.sel = vtx.dst_gpr;
5521		alu.dst.write = 1;
5522
5523		alu.src[0].sel = vtx.dst_gpr;
5524		alu.src[0].chan = 3;
5525
5526		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
5527		alu.src[1].chan = 0;
5528		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5529
5530		alu.last = 1;
5531		r = r600_bytecode_add_alu(ctx->bc, &alu);
5532		if (r)
5533			return r;
5534	}
5535	return 0;
5536}
5537
5538static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
5539{
5540	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5541	struct r600_bytecode_alu alu;
5542	int r;
5543	int id = tgsi_tex_get_src_gpr(ctx, 1);
5544
5545	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5546	alu.op = ALU_OP1_MOV;
5547	alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
5548	if (ctx->bc->chip_class >= EVERGREEN) {
5549		/* channel 0 or 2 of each word */
5550		alu.src[0].sel += (id / 2);
5551		alu.src[0].chan = (id % 2) * 2;
5552	} else {
5553		/* r600 we have them at channel 2 of the second dword */
5554		alu.src[0].sel += (id * 2) + 1;
5555		alu.src[0].chan = 1;
5556	}
5557	alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5558	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
5559	alu.last = 1;
5560	r = r600_bytecode_add_alu(ctx->bc, &alu);
5561	if (r)
5562		return r;
5563	return 0;
5564}
5565
5566static int tgsi_tex(struct r600_shader_ctx *ctx)
5567{
5568	static float one_point_five = 1.5f;
5569	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5570	struct r600_bytecode_tex tex;
5571	struct r600_bytecode_alu alu;
5572	unsigned src_gpr;
5573	int r, i, j;
5574	int opcode;
5575	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
5576				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5577				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
5578				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
5579
5580	bool txf_add_offsets = inst->Texture.NumOffsets &&
5581			     inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5582			     inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
5583
5584	/* Texture fetch instructions can only use gprs as source.
5585	 * Also they cannot negate the source or take the absolute value */
5586	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
5587                                              tgsi_tex_src_requires_loading(ctx, 0)) ||
5588					     read_compressed_msaa || txf_add_offsets;
5589
5590	boolean src_loaded = FALSE;
5591	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
5592	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
5593	boolean has_txq_cube_array_z = false;
5594	unsigned sampler_index_mode;
5595
5596	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
5597	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5598	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
5599		if (inst->Dst[0].Register.WriteMask & 4) {
5600			ctx->shader->has_txq_cube_array_z_comp = true;
5601			has_txq_cube_array_z = true;
5602		}
5603
5604	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
5605	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5606	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
5607	    inst->Instruction.Opcode == TGSI_OPCODE_TG4)
5608		sampler_src_reg = 2;
5609
5610	/* TGSI moves the sampler to src reg 3 for TXD */
5611	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
5612		sampler_src_reg = 3;
5613
5614	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
5615	if (sampler_index_mode)
5616		ctx->shader->uses_index_registers = true;
5617
5618	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5619
5620	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
5621		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
5622			ctx->shader->uses_tex_buffers = true;
5623			return r600_do_buffer_txq(ctx);
5624		}
5625		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
5626			if (ctx->bc->chip_class < EVERGREEN)
5627				ctx->shader->uses_tex_buffers = true;
5628			return do_vtx_fetch_inst(ctx, src_requires_loading);
5629		}
5630	}
5631
5632	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
5633		int out_chan;
5634		/* Add perspective divide */
5635		if (ctx->bc->chip_class == CAYMAN) {
5636			out_chan = 2;
5637			for (i = 0; i < 3; i++) {
5638				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5639				alu.op = ALU_OP1_RECIP_IEEE;
5640				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5641
5642				alu.dst.sel = ctx->temp_reg;
5643				alu.dst.chan = i;
5644				if (i == 2)
5645					alu.last = 1;
5646				if (out_chan == i)
5647					alu.dst.write = 1;
5648				r = r600_bytecode_add_alu(ctx->bc, &alu);
5649				if (r)
5650					return r;
5651			}
5652
5653		} else {
5654			out_chan = 3;
5655			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5656			alu.op = ALU_OP1_RECIP_IEEE;
5657			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5658
5659			alu.dst.sel = ctx->temp_reg;
5660			alu.dst.chan = out_chan;
5661			alu.last = 1;
5662			alu.dst.write = 1;
5663			r = r600_bytecode_add_alu(ctx->bc, &alu);
5664			if (r)
5665				return r;
5666		}
5667
5668		for (i = 0; i < 3; i++) {
5669			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5670			alu.op = ALU_OP2_MUL;
5671			alu.src[0].sel = ctx->temp_reg;
5672			alu.src[0].chan = out_chan;
5673			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5674			alu.dst.sel = ctx->temp_reg;
5675			alu.dst.chan = i;
5676			alu.dst.write = 1;
5677			r = r600_bytecode_add_alu(ctx->bc, &alu);
5678			if (r)
5679				return r;
5680		}
5681		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5682		alu.op = ALU_OP1_MOV;
5683		alu.src[0].sel = V_SQ_ALU_SRC_1;
5684		alu.src[0].chan = 0;
5685		alu.dst.sel = ctx->temp_reg;
5686		alu.dst.chan = 3;
5687		alu.last = 1;
5688		alu.dst.write = 1;
5689		r = r600_bytecode_add_alu(ctx->bc, &alu);
5690		if (r)
5691			return r;
5692		src_loaded = TRUE;
5693		src_gpr = ctx->temp_reg;
5694	}
5695
5696
5697	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
5698	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5699	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5700	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
5701	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
5702	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
5703
5704		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
5705		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
5706
5707		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
5708		for (i = 0; i < 4; i++) {
5709			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5710			alu.op = ALU_OP2_CUBE;
5711			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
5712			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
5713			alu.dst.sel = ctx->temp_reg;
5714			alu.dst.chan = i;
5715			if (i == 3)
5716				alu.last = 1;
5717			alu.dst.write = 1;
5718			r = r600_bytecode_add_alu(ctx->bc, &alu);
5719			if (r)
5720				return r;
5721		}
5722
5723		/* tmp1.z = RCP_e(|tmp1.z|) */
5724		if (ctx->bc->chip_class == CAYMAN) {
5725			for (i = 0; i < 3; i++) {
5726				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5727				alu.op = ALU_OP1_RECIP_IEEE;
5728				alu.src[0].sel = ctx->temp_reg;
5729				alu.src[0].chan = 2;
5730				alu.src[0].abs = 1;
5731				alu.dst.sel = ctx->temp_reg;
5732				alu.dst.chan = i;
5733				if (i == 2)
5734					alu.dst.write = 1;
5735				if (i == 2)
5736					alu.last = 1;
5737				r = r600_bytecode_add_alu(ctx->bc, &alu);
5738				if (r)
5739					return r;
5740			}
5741		} else {
5742			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5743			alu.op = ALU_OP1_RECIP_IEEE;
5744			alu.src[0].sel = ctx->temp_reg;
5745			alu.src[0].chan = 2;
5746			alu.src[0].abs = 1;
5747			alu.dst.sel = ctx->temp_reg;
5748			alu.dst.chan = 2;
5749			alu.dst.write = 1;
5750			alu.last = 1;
5751			r = r600_bytecode_add_alu(ctx->bc, &alu);
5752			if (r)
5753				return r;
5754		}
5755
5756		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
5757		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
5758		 * muladd has no writemask, have to use another temp
5759		 */
5760		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5761		alu.op = ALU_OP3_MULADD;
5762		alu.is_op3 = 1;
5763
5764		alu.src[0].sel = ctx->temp_reg;
5765		alu.src[0].chan = 0;
5766		alu.src[1].sel = ctx->temp_reg;
5767		alu.src[1].chan = 2;
5768
5769		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5770		alu.src[2].chan = 0;
5771		alu.src[2].value = *(uint32_t *)&one_point_five;
5772
5773		alu.dst.sel = ctx->temp_reg;
5774		alu.dst.chan = 0;
5775		alu.dst.write = 1;
5776
5777		r = r600_bytecode_add_alu(ctx->bc, &alu);
5778		if (r)
5779			return r;
5780
5781		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5782		alu.op = ALU_OP3_MULADD;
5783		alu.is_op3 = 1;
5784
5785		alu.src[0].sel = ctx->temp_reg;
5786		alu.src[0].chan = 1;
5787		alu.src[1].sel = ctx->temp_reg;
5788		alu.src[1].chan = 2;
5789
5790		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5791		alu.src[2].chan = 0;
5792		alu.src[2].value = *(uint32_t *)&one_point_five;
5793
5794		alu.dst.sel = ctx->temp_reg;
5795		alu.dst.chan = 1;
5796		alu.dst.write = 1;
5797
5798		alu.last = 1;
5799		r = r600_bytecode_add_alu(ctx->bc, &alu);
5800		if (r)
5801			return r;
5802		/* write initial compare value into Z component
5803		  - W src 0 for shadow cube
5804		  - X src 1 for shadow cube array */
5805		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5806		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5807			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5808			alu.op = ALU_OP1_MOV;
5809			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
5810				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5811			else
5812				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5813			alu.dst.sel = ctx->temp_reg;
5814			alu.dst.chan = 2;
5815			alu.dst.write = 1;
5816			alu.last = 1;
5817			r = r600_bytecode_add_alu(ctx->bc, &alu);
5818			if (r)
5819				return r;
5820		}
5821
5822		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5823		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5824			if (ctx->bc->chip_class >= EVERGREEN) {
5825				int mytmp = r600_get_temp(ctx);
5826				static const float eight = 8.0f;
5827				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5828				alu.op = ALU_OP1_MOV;
5829				alu.src[0].sel = ctx->temp_reg;
5830				alu.src[0].chan = 3;
5831				alu.dst.sel = mytmp;
5832				alu.dst.chan = 0;
5833				alu.dst.write = 1;
5834				alu.last = 1;
5835				r = r600_bytecode_add_alu(ctx->bc, &alu);
5836				if (r)
5837					return r;
5838
5839				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
5840				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5841				alu.op = ALU_OP3_MULADD;
5842				alu.is_op3 = 1;
5843				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5844				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5845				alu.src[1].chan = 0;
5846				alu.src[1].value = *(uint32_t *)&eight;
5847				alu.src[2].sel = mytmp;
5848				alu.src[2].chan = 0;
5849				alu.dst.sel = ctx->temp_reg;
5850				alu.dst.chan = 3;
5851				alu.dst.write = 1;
5852				alu.last = 1;
5853				r = r600_bytecode_add_alu(ctx->bc, &alu);
5854				if (r)
5855					return r;
5856			} else if (ctx->bc->chip_class < EVERGREEN) {
5857				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5858				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
5859				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5860				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5861				tex.src_gpr = r600_get_temp(ctx);
5862				tex.src_sel_x = 0;
5863				tex.src_sel_y = 0;
5864				tex.src_sel_z = 0;
5865				tex.src_sel_w = 0;
5866				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
5867				tex.coord_type_x = 1;
5868				tex.coord_type_y = 1;
5869				tex.coord_type_z = 1;
5870				tex.coord_type_w = 1;
5871				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5872				alu.op = ALU_OP1_MOV;
5873				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5874				alu.dst.sel = tex.src_gpr;
5875				alu.dst.chan = 0;
5876				alu.last = 1;
5877				alu.dst.write = 1;
5878				r = r600_bytecode_add_alu(ctx->bc, &alu);
5879				if (r)
5880					return r;
5881
5882				r = r600_bytecode_add_tex(ctx->bc, &tex);
5883				if (r)
5884					return r;
5885			}
5886
5887		}
5888
5889		/* for cube forms of lod and bias we need to route things */
5890		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
5891		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
5892		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5893		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
5894			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5895			alu.op = ALU_OP1_MOV;
5896			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5897			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
5898				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5899			else
5900				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5901			alu.dst.sel = ctx->temp_reg;
5902			alu.dst.chan = 2;
5903			alu.last = 1;
5904			alu.dst.write = 1;
5905			r = r600_bytecode_add_alu(ctx->bc, &alu);
5906			if (r)
5907				return r;
5908		}
5909
5910		src_loaded = TRUE;
5911		src_gpr = ctx->temp_reg;
5912	}
5913
5914	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
5915		int temp_h = 0, temp_v = 0;
5916		int start_val = 0;
5917
5918		/* if we've already loaded the src (i.e. CUBE don't reload it). */
5919		if (src_loaded == TRUE)
5920			start_val = 1;
5921		else
5922			src_loaded = TRUE;
5923		for (i = start_val; i < 3; i++) {
5924			int treg = r600_get_temp(ctx);
5925
5926			if (i == 0)
5927				src_gpr = treg;
5928			else if (i == 1)
5929				temp_h = treg;
5930			else
5931				temp_v = treg;
5932
5933			for (j = 0; j < 4; j++) {
5934				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5935				alu.op = ALU_OP1_MOV;
5936                                r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
5937                                alu.dst.sel = treg;
5938                                alu.dst.chan = j;
5939                                if (j == 3)
5940                                   alu.last = 1;
5941                                alu.dst.write = 1;
5942                                r = r600_bytecode_add_alu(ctx->bc, &alu);
5943                                if (r)
5944                                    return r;
5945			}
5946		}
5947		for (i = 1; i < 3; i++) {
5948			/* set gradients h/v */
5949			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5950			tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
5951				FETCH_OP_SET_GRADIENTS_V;
5952			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5953			tex.sampler_index_mode = sampler_index_mode;
5954			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5955			tex.resource_index_mode = sampler_index_mode;
5956
5957			tex.src_gpr = (i == 1) ? temp_h : temp_v;
5958			tex.src_sel_x = 0;
5959			tex.src_sel_y = 1;
5960			tex.src_sel_z = 2;
5961			tex.src_sel_w = 3;
5962
5963			tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
5964			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
5965			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
5966				tex.coord_type_x = 1;
5967				tex.coord_type_y = 1;
5968				tex.coord_type_z = 1;
5969				tex.coord_type_w = 1;
5970			}
5971			r = r600_bytecode_add_tex(ctx->bc, &tex);
5972			if (r)
5973				return r;
5974		}
5975	}
5976
5977	if (src_requires_loading && !src_loaded) {
5978		for (i = 0; i < 4; i++) {
5979			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5980			alu.op = ALU_OP1_MOV;
5981			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5982			alu.dst.sel = ctx->temp_reg;
5983			alu.dst.chan = i;
5984			if (i == 3)
5985				alu.last = 1;
5986			alu.dst.write = 1;
5987			r = r600_bytecode_add_alu(ctx->bc, &alu);
5988			if (r)
5989				return r;
5990		}
5991		src_loaded = TRUE;
5992		src_gpr = ctx->temp_reg;
5993	}
5994
5995	/* get offset values */
5996	if (inst->Texture.NumOffsets) {
5997		assert(inst->Texture.NumOffsets == 1);
5998
5999		/* The texture offset feature doesn't work with the TXF instruction
6000		 * and must be emulated by adding the offset to the texture coordinates. */
6001		if (txf_add_offsets) {
6002			const struct tgsi_texture_offset *off = inst->TexOffsets;
6003
6004			switch (inst->Texture.Texture) {
6005			case TGSI_TEXTURE_3D:
6006				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6007				alu.op = ALU_OP2_ADD_INT;
6008				alu.src[0].sel = src_gpr;
6009				alu.src[0].chan = 2;
6010				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6011				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
6012				alu.dst.sel = src_gpr;
6013				alu.dst.chan = 2;
6014				alu.dst.write = 1;
6015				alu.last = 1;
6016				r = r600_bytecode_add_alu(ctx->bc, &alu);
6017				if (r)
6018					return r;
6019				/* fall through */
6020
6021			case TGSI_TEXTURE_2D:
6022			case TGSI_TEXTURE_SHADOW2D:
6023			case TGSI_TEXTURE_RECT:
6024			case TGSI_TEXTURE_SHADOWRECT:
6025			case TGSI_TEXTURE_2D_ARRAY:
6026			case TGSI_TEXTURE_SHADOW2D_ARRAY:
6027				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6028				alu.op = ALU_OP2_ADD_INT;
6029				alu.src[0].sel = src_gpr;
6030				alu.src[0].chan = 1;
6031				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6032				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
6033				alu.dst.sel = src_gpr;
6034				alu.dst.chan = 1;
6035				alu.dst.write = 1;
6036				alu.last = 1;
6037				r = r600_bytecode_add_alu(ctx->bc, &alu);
6038				if (r)
6039					return r;
6040				/* fall through */
6041
6042			case TGSI_TEXTURE_1D:
6043			case TGSI_TEXTURE_SHADOW1D:
6044			case TGSI_TEXTURE_1D_ARRAY:
6045			case TGSI_TEXTURE_SHADOW1D_ARRAY:
6046				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6047				alu.op = ALU_OP2_ADD_INT;
6048				alu.src[0].sel = src_gpr;
6049				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6050				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
6051				alu.dst.sel = src_gpr;
6052				alu.dst.write = 1;
6053				alu.last = 1;
6054				r = r600_bytecode_add_alu(ctx->bc, &alu);
6055				if (r)
6056					return r;
6057				break;
6058				/* texture offsets do not apply to other texture targets */
6059			}
6060		} else {
6061			switch (inst->Texture.Texture) {
6062			case TGSI_TEXTURE_3D:
6063				offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
6064				/* fallthrough */
6065			case TGSI_TEXTURE_2D:
6066			case TGSI_TEXTURE_SHADOW2D:
6067			case TGSI_TEXTURE_RECT:
6068			case TGSI_TEXTURE_SHADOWRECT:
6069			case TGSI_TEXTURE_2D_ARRAY:
6070			case TGSI_TEXTURE_SHADOW2D_ARRAY:
6071				offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
6072				/* fallthrough */
6073			case TGSI_TEXTURE_1D:
6074			case TGSI_TEXTURE_SHADOW1D:
6075			case TGSI_TEXTURE_1D_ARRAY:
6076			case TGSI_TEXTURE_SHADOW1D_ARRAY:
6077				offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
6078			}
6079		}
6080	}
6081
6082	/* Obtain the sample index for reading a compressed MSAA color texture.
6083	 * To read the FMASK, we use the ldfptr instruction, which tells us
6084	 * where the samples are stored.
6085	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
6086	 * which is the identity mapping. Each nibble says which physical sample
6087	 * should be fetched to get that sample.
6088	 *
6089	 * Assume src.z contains the sample index. It should be modified like this:
6090	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
6091	 * Then fetch the texel with src.
6092	 */
6093	if (read_compressed_msaa) {
6094		unsigned sample_chan = 3;
6095		unsigned temp = r600_get_temp(ctx);
6096		assert(src_loaded);
6097
6098		/* temp.w = ldfptr() */
6099		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6100		tex.op = FETCH_OP_LD;
6101		tex.inst_mod = 1; /* to indicate this is ldfptr */
6102		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6103		tex.sampler_index_mode = sampler_index_mode;
6104		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6105		tex.resource_index_mode = sampler_index_mode;
6106		tex.src_gpr = src_gpr;
6107		tex.dst_gpr = temp;
6108		tex.dst_sel_x = 7; /* mask out these components */
6109		tex.dst_sel_y = 7;
6110		tex.dst_sel_z = 7;
6111		tex.dst_sel_w = 0; /* store X */
6112		tex.src_sel_x = 0;
6113		tex.src_sel_y = 1;
6114		tex.src_sel_z = 2;
6115		tex.src_sel_w = 3;
6116		tex.offset_x = offset_x;
6117		tex.offset_y = offset_y;
6118		tex.offset_z = offset_z;
6119		r = r600_bytecode_add_tex(ctx->bc, &tex);
6120		if (r)
6121			return r;
6122
6123		/* temp.x = sample_index*4 */
6124		if (ctx->bc->chip_class == CAYMAN) {
6125			for (i = 0 ; i < 4; i++) {
6126				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6127				alu.op = ALU_OP2_MULLO_INT;
6128				alu.src[0].sel = src_gpr;
6129				alu.src[0].chan = sample_chan;
6130				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6131				alu.src[1].value = 4;
6132				alu.dst.sel = temp;
6133				alu.dst.chan = i;
6134				alu.dst.write = i == 0;
6135				if (i == 3)
6136					alu.last = 1;
6137				r = r600_bytecode_add_alu(ctx->bc, &alu);
6138				if (r)
6139					return r;
6140			}
6141		} else {
6142			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6143			alu.op = ALU_OP2_MULLO_INT;
6144			alu.src[0].sel = src_gpr;
6145			alu.src[0].chan = sample_chan;
6146			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6147			alu.src[1].value = 4;
6148			alu.dst.sel = temp;
6149			alu.dst.chan = 0;
6150			alu.dst.write = 1;
6151			alu.last = 1;
6152			r = r600_bytecode_add_alu(ctx->bc, &alu);
6153			if (r)
6154				return r;
6155		}
6156
6157		/* sample_index = temp.w >> temp.x */
6158		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6159		alu.op = ALU_OP2_LSHR_INT;
6160		alu.src[0].sel = temp;
6161		alu.src[0].chan = 3;
6162		alu.src[1].sel = temp;
6163		alu.src[1].chan = 0;
6164		alu.dst.sel = src_gpr;
6165		alu.dst.chan = sample_chan;
6166		alu.dst.write = 1;
6167		alu.last = 1;
6168		r = r600_bytecode_add_alu(ctx->bc, &alu);
6169		if (r)
6170			return r;
6171
6172		/* sample_index & 0xF */
6173		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6174		alu.op = ALU_OP2_AND_INT;
6175		alu.src[0].sel = src_gpr;
6176		alu.src[0].chan = sample_chan;
6177		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6178		alu.src[1].value = 0xF;
6179		alu.dst.sel = src_gpr;
6180		alu.dst.chan = sample_chan;
6181		alu.dst.write = 1;
6182		alu.last = 1;
6183		r = r600_bytecode_add_alu(ctx->bc, &alu);
6184		if (r)
6185			return r;
6186#if 0
6187		/* visualize the FMASK */
6188		for (i = 0; i < 4; i++) {
6189			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6190			alu.op = ALU_OP1_INT_TO_FLT;
6191			alu.src[0].sel = src_gpr;
6192			alu.src[0].chan = sample_chan;
6193			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6194			alu.dst.chan = i;
6195			alu.dst.write = 1;
6196			alu.last = 1;
6197			r = r600_bytecode_add_alu(ctx->bc, &alu);
6198			if (r)
6199				return r;
6200		}
6201		return 0;
6202#endif
6203	}
6204
6205	/* does this shader want a num layers from TXQ for a cube array? */
6206	if (has_txq_cube_array_z) {
6207		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6208
6209		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6210		alu.op = ALU_OP1_MOV;
6211
6212		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6213		if (ctx->bc->chip_class >= EVERGREEN) {
6214			/* channel 1 or 3 of each word */
6215			alu.src[0].sel += (id / 2);
6216			alu.src[0].chan = ((id % 2) * 2) + 1;
6217		} else {
6218			/* r600 we have them at channel 2 of the second dword */
6219			alu.src[0].sel += (id * 2) + 1;
6220			alu.src[0].chan = 2;
6221		}
6222		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6223		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
6224		alu.last = 1;
6225		r = r600_bytecode_add_alu(ctx->bc, &alu);
6226		if (r)
6227			return r;
6228		/* disable writemask from texture instruction */
6229		inst->Dst[0].Register.WriteMask &= ~4;
6230	}
6231
6232	opcode = ctx->inst_info->op;
6233	if (opcode == FETCH_OP_GATHER4 &&
6234		inst->TexOffsets[0].File != TGSI_FILE_NULL &&
6235		inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
6236		opcode = FETCH_OP_GATHER4_O;
6237
6238		/* GATHER4_O/GATHER4_C_O use offset values loaded by
6239		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
6240		   encoded in the instruction are ignored. */
6241		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6242		tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
6243		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6244		tex.sampler_index_mode = sampler_index_mode;
6245		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6246		tex.resource_index_mode = sampler_index_mode;
6247
6248		tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
6249		tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
6250		tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
6251		tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
6252		tex.src_sel_w = 4;
6253
6254		tex.dst_sel_x = 7;
6255		tex.dst_sel_y = 7;
6256		tex.dst_sel_z = 7;
6257		tex.dst_sel_w = 7;
6258
6259		r = r600_bytecode_add_tex(ctx->bc, &tex);
6260		if (r)
6261			return r;
6262	}
6263
6264	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
6265	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
6266	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
6267	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6268	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
6269	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
6270	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6271		switch (opcode) {
6272		case FETCH_OP_SAMPLE:
6273			opcode = FETCH_OP_SAMPLE_C;
6274			break;
6275		case FETCH_OP_SAMPLE_L:
6276			opcode = FETCH_OP_SAMPLE_C_L;
6277			break;
6278		case FETCH_OP_SAMPLE_LB:
6279			opcode = FETCH_OP_SAMPLE_C_LB;
6280			break;
6281		case FETCH_OP_SAMPLE_G:
6282			opcode = FETCH_OP_SAMPLE_C_G;
6283			break;
6284		/* Texture gather variants */
6285		case FETCH_OP_GATHER4:
6286			opcode = FETCH_OP_GATHER4_C;
6287			break;
6288		case FETCH_OP_GATHER4_O:
6289			opcode = FETCH_OP_GATHER4_C_O;
6290			break;
6291		}
6292	}
6293
6294	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6295	tex.op = opcode;
6296
6297	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6298	tex.sampler_index_mode = sampler_index_mode;
6299	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6300	tex.resource_index_mode = sampler_index_mode;
6301	tex.src_gpr = src_gpr;
6302	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6303
6304	if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
6305		inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
6306		tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
6307	}
6308
6309	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
6310		int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
6311		tex.inst_mod = texture_component_select;
6312
6313		if (ctx->bc->chip_class == CAYMAN) {
6314		/* GATHER4 result order is different from TGSI TG4 */
6315			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
6316			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
6317			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
6318			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6319		} else {
6320			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6321			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
6322			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6323			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6324		}
6325	}
6326	else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
6327		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6328		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6329		tex.dst_sel_z = 7;
6330		tex.dst_sel_w = 7;
6331	}
6332	else {
6333		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6334		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6335		tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
6336		tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6337	}
6338
6339
6340	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
6341		tex.src_sel_x = 4;
6342		tex.src_sel_y = 4;
6343		tex.src_sel_z = 4;
6344		tex.src_sel_w = 4;
6345	} else if (src_loaded) {
6346		tex.src_sel_x = 0;
6347		tex.src_sel_y = 1;
6348		tex.src_sel_z = 2;
6349		tex.src_sel_w = 3;
6350	} else {
6351		tex.src_sel_x = ctx->src[0].swizzle[0];
6352		tex.src_sel_y = ctx->src[0].swizzle[1];
6353		tex.src_sel_z = ctx->src[0].swizzle[2];
6354		tex.src_sel_w = ctx->src[0].swizzle[3];
6355		tex.src_rel = ctx->src[0].rel;
6356	}
6357
6358	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
6359	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6360	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6361	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6362		tex.src_sel_x = 1;
6363		tex.src_sel_y = 0;
6364		tex.src_sel_z = 3;
6365		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
6366	}
6367
6368	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
6369	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
6370		tex.coord_type_x = 1;
6371		tex.coord_type_y = 1;
6372	}
6373	tex.coord_type_z = 1;
6374	tex.coord_type_w = 1;
6375
6376	tex.offset_x = offset_x;
6377	tex.offset_y = offset_y;
6378	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
6379		(inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
6380		 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
6381		tex.offset_z = 0;
6382	}
6383	else {
6384		tex.offset_z = offset_z;
6385	}
6386
6387	/* Put the depth for comparison in W.
6388	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
6389	 * Some instructions expect the depth in Z. */
6390	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
6391	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
6392	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
6393	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
6394	    opcode != FETCH_OP_SAMPLE_C_L &&
6395	    opcode != FETCH_OP_SAMPLE_C_LB) {
6396		tex.src_sel_w = tex.src_sel_z;
6397	}
6398
6399	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
6400	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
6401		if (opcode == FETCH_OP_SAMPLE_C_L ||
6402		    opcode == FETCH_OP_SAMPLE_C_LB) {
6403			/* the array index is read from Y */
6404			tex.coord_type_y = 0;
6405		} else {
6406			/* the array index is read from Z */
6407			tex.coord_type_z = 0;
6408			tex.src_sel_z = tex.src_sel_y;
6409		}
6410	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
6411		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
6412		   ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6413		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
6414		    (ctx->bc->chip_class >= EVERGREEN)))
6415		/* the array index is read from Z */
6416		tex.coord_type_z = 0;
6417
6418	/* mask unused source components */
6419	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
6420		switch (inst->Texture.Texture) {
6421		case TGSI_TEXTURE_2D:
6422		case TGSI_TEXTURE_RECT:
6423			tex.src_sel_z = 7;
6424			tex.src_sel_w = 7;
6425			break;
6426		case TGSI_TEXTURE_1D_ARRAY:
6427			tex.src_sel_y = 7;
6428			tex.src_sel_w = 7;
6429			break;
6430		case TGSI_TEXTURE_1D:
6431			tex.src_sel_y = 7;
6432			tex.src_sel_z = 7;
6433			tex.src_sel_w = 7;
6434			break;
6435		}
6436	}
6437
6438	r = r600_bytecode_add_tex(ctx->bc, &tex);
6439	if (r)
6440		return r;
6441
6442	/* add shadow ambient support  - gallium doesn't do it yet */
6443	return 0;
6444}
6445
/* Emit TGSI LRP: dst = src0 * src1 + (1 - src0) * src2, per enabled
 * destination channel.
 *
 * Fast path: when src0 is the inline constant 0.5 the blend is an equal
 * average, emitted as a single ADD with the divide-by-2 output modifier.
 * General path, three ALU passes over the write mask:
 *   1. temp = 1 - src0            (ADD with src0 negated)
 *   2. temp = temp * src2
 *   3. dst  = src0 * src1 + temp  (MULADD)
 * Returns 0 on success or the r600_bytecode_add_alu() error code.
 */
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned i, temp_regs[2];
	int r;

	/* optimize if it's just an equal balance */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			alu.omod = 3; /* output modifier: result / 2 */
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* 1 - src0 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		r600_bytecode_src_toggle_neg(&alu.src[1]);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* src0 * src1 + (1 - src0) * src2 */
	/* abs sources are staged through temps by tgsi_make_src_for_op3()
	 * (presumably because op3 source encodings lack the abs modifier —
	 * see that helper for the actual copy). */
        if (ctx->src[0].abs)
		temp_regs[0] = r600_get_temp(ctx);
	else
		temp_regs[0] = 0;
	if (ctx->src[1].abs)
		temp_regs[1] = r600_get_temp(ctx);
	else
		temp_regs[1] = 0;

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;
		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
		if (r)
			return r;
		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
		if (r)
			return r;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6557
6558static int tgsi_cmp(struct r600_shader_ctx *ctx)
6559{
6560	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6561	struct r600_bytecode_alu alu;
6562	int i, r, j;
6563	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6564	int temp_regs[3];
6565
6566	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6567		temp_regs[j] = 0;
6568		if (ctx->src[j].abs)
6569			temp_regs[j] = r600_get_temp(ctx);
6570	}
6571
6572	for (i = 0; i < lasti + 1; i++) {
6573		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6574			continue;
6575
6576		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6577		alu.op = ALU_OP3_CNDGE;
6578		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
6579		if (r)
6580			return r;
6581		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
6582		if (r)
6583			return r;
6584		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
6585		if (r)
6586			return r;
6587		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6588		alu.dst.chan = i;
6589		alu.dst.write = 1;
6590		alu.is_op3 = 1;
6591		if (i == lasti)
6592			alu.last = 1;
6593		r = r600_bytecode_add_alu(ctx->bc, &alu);
6594		if (r)
6595			return r;
6596	}
6597	return 0;
6598}
6599
6600static int tgsi_ucmp(struct r600_shader_ctx *ctx)
6601{
6602	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6603	struct r600_bytecode_alu alu;
6604	int i, r;
6605	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6606
6607	for (i = 0; i < lasti + 1; i++) {
6608		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6609			continue;
6610
6611		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6612		alu.op = ALU_OP3_CNDE_INT;
6613		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6614		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6615		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6616		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6617		alu.dst.chan = i;
6618		alu.dst.write = 1;
6619		alu.is_op3 = 1;
6620		if (i == lasti)
6621			alu.last = 1;
6622		r = r600_bytecode_add_alu(ctx->bc, &alu);
6623		if (r)
6624			return r;
6625	}
6626	return 0;
6627}
6628
/* Emit TGSI XPD (cross product):
 *   dst.xyz = src0.yzx * src1.zxy - src0.zxy * src1.yzx
 * Built in two passes:
 *   pass 1: temp = src0.zxy * src1.yzx            (MUL)
 *   pass 2: dst  = src0.yzx * src1.zxy - temp     (MULADD, temp negated)
 * Channel w is fed 0 * 0 in both passes, so dst.w ends up 0.
 * NOTE(review): TGSI documents XPD.w as 1.0 — confirm whether any caller
 * relies on .w here.
 * With a partial write mask the result goes through ctx->temp_reg and
 * tgsi_helper_copy() applies the mask.
 */
static int tgsi_xpd(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* rotated channel selections for the two multiply passes */
	static const unsigned int src0_swizzle[] = {2, 0, 1};
	static const unsigned int src1_swizzle[] = {1, 2, 0};
	struct r600_bytecode_alu alu;
	uint32_t use_temp = 0;
	int i, r;

	if (inst->Dst[0].Register.WriteMask != 0xf)
		use_temp = 1;

	/* pass 1: temp = src0.zxy * src1.yzx */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
		} else {
			/* w lane: 0 * 0 */
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* pass 2: dst = src0.yzx * src1.zxy - temp (note the swizzle tables
	 * are applied to the opposite sources this time) */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;

		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
		} else {
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].neg = 1; /* subtract the first pass's product */
		alu.src[2].chan = i;

		if (use_temp)
			alu.dst.sel = ctx->temp_reg;
		else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	if (use_temp)
		return tgsi_helper_copy(ctx, inst);
	return 0;
}
6700
6701static int tgsi_exp(struct r600_shader_ctx *ctx)
6702{
6703	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6704	struct r600_bytecode_alu alu;
6705	int r;
6706	int i;
6707
6708	/* result.x = 2^floor(src); */
6709	if (inst->Dst[0].Register.WriteMask & 1) {
6710		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6711
6712		alu.op = ALU_OP1_FLOOR;
6713		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6714
6715		alu.dst.sel = ctx->temp_reg;
6716		alu.dst.chan = 0;
6717		alu.dst.write = 1;
6718		alu.last = 1;
6719		r = r600_bytecode_add_alu(ctx->bc, &alu);
6720		if (r)
6721			return r;
6722
6723		if (ctx->bc->chip_class == CAYMAN) {
6724			for (i = 0; i < 3; i++) {
6725				alu.op = ALU_OP1_EXP_IEEE;
6726				alu.src[0].sel = ctx->temp_reg;
6727				alu.src[0].chan = 0;
6728
6729				alu.dst.sel = ctx->temp_reg;
6730				alu.dst.chan = i;
6731				alu.dst.write = i == 0;
6732				alu.last = i == 2;
6733				r = r600_bytecode_add_alu(ctx->bc, &alu);
6734				if (r)
6735					return r;
6736			}
6737		} else {
6738			alu.op = ALU_OP1_EXP_IEEE;
6739			alu.src[0].sel = ctx->temp_reg;
6740			alu.src[0].chan = 0;
6741
6742			alu.dst.sel = ctx->temp_reg;
6743			alu.dst.chan = 0;
6744			alu.dst.write = 1;
6745			alu.last = 1;
6746			r = r600_bytecode_add_alu(ctx->bc, &alu);
6747			if (r)
6748				return r;
6749		}
6750	}
6751
6752	/* result.y = tmp - floor(tmp); */
6753	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
6754		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6755
6756		alu.op = ALU_OP1_FRACT;
6757		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6758
6759		alu.dst.sel = ctx->temp_reg;
6760#if 0
6761		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6762		if (r)
6763			return r;
6764#endif
6765		alu.dst.write = 1;
6766		alu.dst.chan = 1;
6767
6768		alu.last = 1;
6769
6770		r = r600_bytecode_add_alu(ctx->bc, &alu);
6771		if (r)
6772			return r;
6773	}
6774
6775	/* result.z = RoughApprox2ToX(tmp);*/
6776	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
6777		if (ctx->bc->chip_class == CAYMAN) {
6778			for (i = 0; i < 3; i++) {
6779				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6780				alu.op = ALU_OP1_EXP_IEEE;
6781				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6782
6783				alu.dst.sel = ctx->temp_reg;
6784				alu.dst.chan = i;
6785				if (i == 2) {
6786					alu.dst.write = 1;
6787					alu.last = 1;
6788				}
6789
6790				r = r600_bytecode_add_alu(ctx->bc, &alu);
6791				if (r)
6792					return r;
6793			}
6794		} else {
6795			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6796			alu.op = ALU_OP1_EXP_IEEE;
6797			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6798
6799			alu.dst.sel = ctx->temp_reg;
6800			alu.dst.write = 1;
6801			alu.dst.chan = 2;
6802
6803			alu.last = 1;
6804
6805			r = r600_bytecode_add_alu(ctx->bc, &alu);
6806			if (r)
6807				return r;
6808		}
6809	}
6810
6811	/* result.w = 1.0;*/
6812	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
6813		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6814
6815		alu.op = ALU_OP1_MOV;
6816		alu.src[0].sel = V_SQ_ALU_SRC_1;
6817		alu.src[0].chan = 0;
6818
6819		alu.dst.sel = ctx->temp_reg;
6820		alu.dst.chan = 3;
6821		alu.dst.write = 1;
6822		alu.last = 1;
6823		r = r600_bytecode_add_alu(ctx->bc, &alu);
6824		if (r)
6825			return r;
6826	}
6827	return tgsi_helper_copy(ctx, inst);
6828}
6829
6830static int tgsi_log(struct r600_shader_ctx *ctx)
6831{
6832	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6833	struct r600_bytecode_alu alu;
6834	int r;
6835	int i;
6836
6837	/* result.x = floor(log2(|src|)); */
6838	if (inst->Dst[0].Register.WriteMask & 1) {
6839		if (ctx->bc->chip_class == CAYMAN) {
6840			for (i = 0; i < 3; i++) {
6841				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6842
6843				alu.op = ALU_OP1_LOG_IEEE;
6844				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6845				r600_bytecode_src_set_abs(&alu.src[0]);
6846
6847				alu.dst.sel = ctx->temp_reg;
6848				alu.dst.chan = i;
6849				if (i == 0)
6850					alu.dst.write = 1;
6851				if (i == 2)
6852					alu.last = 1;
6853				r = r600_bytecode_add_alu(ctx->bc, &alu);
6854				if (r)
6855					return r;
6856			}
6857
6858		} else {
6859			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6860
6861			alu.op = ALU_OP1_LOG_IEEE;
6862			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6863			r600_bytecode_src_set_abs(&alu.src[0]);
6864
6865			alu.dst.sel = ctx->temp_reg;
6866			alu.dst.chan = 0;
6867			alu.dst.write = 1;
6868			alu.last = 1;
6869			r = r600_bytecode_add_alu(ctx->bc, &alu);
6870			if (r)
6871				return r;
6872		}
6873
6874		alu.op = ALU_OP1_FLOOR;
6875		alu.src[0].sel = ctx->temp_reg;
6876		alu.src[0].chan = 0;
6877
6878		alu.dst.sel = ctx->temp_reg;
6879		alu.dst.chan = 0;
6880		alu.dst.write = 1;
6881		alu.last = 1;
6882
6883		r = r600_bytecode_add_alu(ctx->bc, &alu);
6884		if (r)
6885			return r;
6886	}
6887
6888	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
6889	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
6890
6891		if (ctx->bc->chip_class == CAYMAN) {
6892			for (i = 0; i < 3; i++) {
6893				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6894
6895				alu.op = ALU_OP1_LOG_IEEE;
6896				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6897				r600_bytecode_src_set_abs(&alu.src[0]);
6898
6899				alu.dst.sel = ctx->temp_reg;
6900				alu.dst.chan = i;
6901				if (i == 1)
6902					alu.dst.write = 1;
6903				if (i == 2)
6904					alu.last = 1;
6905
6906				r = r600_bytecode_add_alu(ctx->bc, &alu);
6907				if (r)
6908					return r;
6909			}
6910		} else {
6911			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6912
6913			alu.op = ALU_OP1_LOG_IEEE;
6914			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6915			r600_bytecode_src_set_abs(&alu.src[0]);
6916
6917			alu.dst.sel = ctx->temp_reg;
6918			alu.dst.chan = 1;
6919			alu.dst.write = 1;
6920			alu.last = 1;
6921
6922			r = r600_bytecode_add_alu(ctx->bc, &alu);
6923			if (r)
6924				return r;
6925		}
6926
6927		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6928
6929		alu.op = ALU_OP1_FLOOR;
6930		alu.src[0].sel = ctx->temp_reg;
6931		alu.src[0].chan = 1;
6932
6933		alu.dst.sel = ctx->temp_reg;
6934		alu.dst.chan = 1;
6935		alu.dst.write = 1;
6936		alu.last = 1;
6937
6938		r = r600_bytecode_add_alu(ctx->bc, &alu);
6939		if (r)
6940			return r;
6941
6942		if (ctx->bc->chip_class == CAYMAN) {
6943			for (i = 0; i < 3; i++) {
6944				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6945				alu.op = ALU_OP1_EXP_IEEE;
6946				alu.src[0].sel = ctx->temp_reg;
6947				alu.src[0].chan = 1;
6948
6949				alu.dst.sel = ctx->temp_reg;
6950				alu.dst.chan = i;
6951				if (i == 1)
6952					alu.dst.write = 1;
6953				if (i == 2)
6954					alu.last = 1;
6955
6956				r = r600_bytecode_add_alu(ctx->bc, &alu);
6957				if (r)
6958					return r;
6959			}
6960		} else {
6961			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6962			alu.op = ALU_OP1_EXP_IEEE;
6963			alu.src[0].sel = ctx->temp_reg;
6964			alu.src[0].chan = 1;
6965
6966			alu.dst.sel = ctx->temp_reg;
6967			alu.dst.chan = 1;
6968			alu.dst.write = 1;
6969			alu.last = 1;
6970
6971			r = r600_bytecode_add_alu(ctx->bc, &alu);
6972			if (r)
6973				return r;
6974		}
6975
6976		if (ctx->bc->chip_class == CAYMAN) {
6977			for (i = 0; i < 3; i++) {
6978				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6979				alu.op = ALU_OP1_RECIP_IEEE;
6980				alu.src[0].sel = ctx->temp_reg;
6981				alu.src[0].chan = 1;
6982
6983				alu.dst.sel = ctx->temp_reg;
6984				alu.dst.chan = i;
6985				if (i == 1)
6986					alu.dst.write = 1;
6987				if (i == 2)
6988					alu.last = 1;
6989
6990				r = r600_bytecode_add_alu(ctx->bc, &alu);
6991				if (r)
6992					return r;
6993			}
6994		} else {
6995			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6996			alu.op = ALU_OP1_RECIP_IEEE;
6997			alu.src[0].sel = ctx->temp_reg;
6998			alu.src[0].chan = 1;
6999
7000			alu.dst.sel = ctx->temp_reg;
7001			alu.dst.chan = 1;
7002			alu.dst.write = 1;
7003			alu.last = 1;
7004
7005			r = r600_bytecode_add_alu(ctx->bc, &alu);
7006			if (r)
7007				return r;
7008		}
7009
7010		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7011
7012		alu.op = ALU_OP2_MUL;
7013
7014		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7015		r600_bytecode_src_set_abs(&alu.src[0]);
7016
7017		alu.src[1].sel = ctx->temp_reg;
7018		alu.src[1].chan = 1;
7019
7020		alu.dst.sel = ctx->temp_reg;
7021		alu.dst.chan = 1;
7022		alu.dst.write = 1;
7023		alu.last = 1;
7024
7025		r = r600_bytecode_add_alu(ctx->bc, &alu);
7026		if (r)
7027			return r;
7028	}
7029
7030	/* result.z = log2(|src|);*/
7031	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
7032		if (ctx->bc->chip_class == CAYMAN) {
7033			for (i = 0; i < 3; i++) {
7034				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7035
7036				alu.op = ALU_OP1_LOG_IEEE;
7037				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7038				r600_bytecode_src_set_abs(&alu.src[0]);
7039
7040				alu.dst.sel = ctx->temp_reg;
7041				if (i == 2)
7042					alu.dst.write = 1;
7043				alu.dst.chan = i;
7044				if (i == 2)
7045					alu.last = 1;
7046
7047				r = r600_bytecode_add_alu(ctx->bc, &alu);
7048				if (r)
7049					return r;
7050			}
7051		} else {
7052			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7053
7054			alu.op = ALU_OP1_LOG_IEEE;
7055			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7056			r600_bytecode_src_set_abs(&alu.src[0]);
7057
7058			alu.dst.sel = ctx->temp_reg;
7059			alu.dst.write = 1;
7060			alu.dst.chan = 2;
7061			alu.last = 1;
7062
7063			r = r600_bytecode_add_alu(ctx->bc, &alu);
7064			if (r)
7065				return r;
7066		}
7067	}
7068
7069	/* result.w = 1.0; */
7070	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
7071		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7072
7073		alu.op = ALU_OP1_MOV;
7074		alu.src[0].sel = V_SQ_ALU_SRC_1;
7075		alu.src[0].chan = 0;
7076
7077		alu.dst.sel = ctx->temp_reg;
7078		alu.dst.chan = 3;
7079		alu.dst.write = 1;
7080		alu.last = 1;
7081
7082		r = r600_bytecode_add_alu(ctx->bc, &alu);
7083		if (r)
7084			return r;
7085	}
7086
7087	return tgsi_helper_copy(ctx, inst);
7088}
7089
/* Emit ARL/ARR/UARL on evergreen+: compute an integer address value into
 * the AR register (Dst index 0) or one of the two index registers
 * (Dst index 1/2).
 *   ARL  -> FLT_TO_INT_FLOOR (floor then convert)
 *   ARR  -> FLT_TO_INT       (round/convert — semantics per the ISA op)
 *   UARL -> MOV              (source is already an integer)
 * Clearing ar_loaded / index_loaded[] at the end marks the hardware
 * register stale, presumably forcing a reload before the next indexed
 * access (handled elsewhere — confirm against the index_reg users).
 */
static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	/* Dst index 0 targets ar_reg, indices 1..2 target index_reg[0..1] */
	unsigned reg = inst->Dst[0].Register.Index > 0 ? ctx->bc->index_reg[inst->Dst[0].Register.Index - 1] : ctx->bc->ar_reg;

	assert(inst->Dst[0].Register.Index < 3);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
		break;
	case TGSI_OPCODE_ARR:
		alu.op = ALU_OP1_FLT_TO_INT;
		break;
	case TGSI_OPCODE_UARL:
		alu.op = ALU_OP1_MOV;
		break;
	default:
		assert(0);
		return -1;
	}

	/* one conversion per enabled channel; alu is reused, each field
	 * below is overwritten every iteration */
	for (i = 0; i <= lasti; ++i) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.last = i == lasti;
		alu.dst.sel = reg;
	        alu.dst.chan = i;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (inst->Dst[0].Register.Index > 0)
		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
	else
		ctx->bc->ar_loaded = 0;

	return 0;
}
/* Emit ARL/ARR/UARL on r600/r700: compute the integer address value into
 * ctx->bc->ar_reg.
 *   ARL  -> FLOOR per channel, then FLT_TO_INT over the channels
 *   ARR  -> FLT_TO_INT per channel
 *   UARL -> MOV per channel (source already integer)
 * FLT_TO_INT is a trans-unit-only op on these chips, so each such
 * instruction must close its own ALU group (alu.last = 1 on every one).
 * Clearing ar_loaded at the end marks AR stale so it is reloaded
 * (via MOVA, presumably) before the next indexed access.
 */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		/* NOTE: this conversion pass covers every channel up to lasti
		 * regardless of the write mask; ar_reg is scratch, so the
		 * extra writes are harmless. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	ctx->bc->ar_loaded = 0;
	return 0;
}
7212
7213static int tgsi_opdst(struct r600_shader_ctx *ctx)
7214{
7215	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7216	struct r600_bytecode_alu alu;
7217	int i, r = 0;
7218
7219	for (i = 0; i < 4; i++) {
7220		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7221
7222		alu.op = ALU_OP2_MUL;
7223		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7224
7225		if (i == 0 || i == 3) {
7226			alu.src[0].sel = V_SQ_ALU_SRC_1;
7227		} else {
7228			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7229		}
7230
7231		if (i == 0 || i == 2) {
7232			alu.src[1].sel = V_SQ_ALU_SRC_1;
7233		} else {
7234			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
7235		}
7236		if (i == 3)
7237			alu.last = 1;
7238		r = r600_bytecode_add_alu(ctx->bc, &alu);
7239		if (r)
7240			return r;
7241	}
7242	return 0;
7243}
7244
7245static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
7246{
7247	struct r600_bytecode_alu alu;
7248	int r;
7249
7250	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7251	alu.op = opcode;
7252	alu.execute_mask = 1;
7253	alu.update_pred = 1;
7254
7255	alu.dst.sel = ctx->temp_reg;
7256	alu.dst.write = 1;
7257	alu.dst.chan = 0;
7258
7259	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7260	alu.src[1].sel = V_SQ_ALU_SRC_0;
7261	alu.src[1].chan = 0;
7262
7263	alu.last = 1;
7264
7265	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
7266	if (r)
7267		return r;
7268	return 0;
7269}
7270
/* Pop 'pops' levels off the hardware branch stack. Where possible the
 * pop is folded into the preceding ALU clause by upgrading its CF op to
 * ALU_POP_AFTER / ALU_POP2_AFTER; otherwise a standalone POP CF
 * instruction is emitted.
 * Note: the 'pops' parameter intentionally shadows the function name. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop counts pops already performed by the last CF:
		 * 0 for a plain ALU clause, 1 for ALU_POP_AFTER;
		 * 3 is a sentinel meaning "last CF cannot be extended". */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			/* The clause now carries a pop; later code must not
			 * append to it, so force a fresh CF next time. */
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* More pops than ALU_POP2_AFTER can express, or no
			 * foldable ALU clause: emit an explicit POP below. */
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		/* POP resumes execution at the CF following itself. */
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
7303
/* Recompute the worst-case hardware stack depth (in STACK_SIZE entries)
 * after a push of kind 'reason', and record it in stack->max_entries.
 * The per-chip rules below reserve extra elements for hardware quirks. */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
                                              unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements, entries;

	/* entry_size: stack elements consumed by one LOOP/WQM level on this
	 * chip; plain (VPM) pushes consume one element each. */
	unsigned entry_size = stack->entry_size;

	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	/* Round elements up to whole STACK_SIZE entries. */
	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
7367
7368static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
7369{
7370	switch(reason) {
7371	case FC_PUSH_VPM:
7372		--ctx->bc->stack.push;
7373		assert(ctx->bc->stack.push >= 0);
7374		break;
7375	case FC_PUSH_WQM:
7376		--ctx->bc->stack.push_wqm;
7377		assert(ctx->bc->stack.push_wqm >= 0);
7378		break;
7379	case FC_LOOP:
7380		--ctx->bc->stack.loop;
7381		assert(ctx->bc->stack.loop >= 0);
7382		break;
7383	default:
7384		assert(0);
7385		break;
7386	}
7387}
7388
7389static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
7390{
7391	switch (reason) {
7392	case FC_PUSH_VPM:
7393		++ctx->bc->stack.push;
7394		break;
7395	case FC_PUSH_WQM:
7396		++ctx->bc->stack.push_wqm;
7397	case FC_LOOP:
7398		++ctx->bc->stack.loop;
7399		break;
7400	default:
7401		assert(0);
7402	}
7403
7404	callstack_update_max_depth(ctx, reason);
7405}
7406
7407static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
7408{
7409	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
7410
7411	sp->mid = realloc((void *)sp->mid,
7412						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
7413	sp->mid[sp->num_mid] = ctx->bc->cf_last;
7414	sp->num_mid++;
7415}
7416
7417static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
7418{
7419	ctx->bc->fc_sp++;
7420	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
7421	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
7422}
7423
7424static void fc_poplevel(struct r600_shader_ctx *ctx)
7425{
7426	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
7427	free(sp->mid);
7428	sp->mid = NULL;
7429	sp->num_mid = 0;
7430	sp->start = NULL;
7431	sp->type = 0;
7432	ctx->bc->fc_sp--;
7433}
7434
#if 0
/* Dead scaffolding for subroutine (CAL/RET) support; this block is
 * never compiled.
 * NOTE(review): emit_return() and emit_jump_to_offset() contain stray
 * ')' characters after the r600_bytecode_add_cfinst() calls and would
 * not compile if this block were enabled. Fix before resurrecting. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

/* Stub: would set a per-invocation "returned inside loop" flag. */
static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

/* Stub: would test the flag set above. */
static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
7482
/* Common IF prologue: emit a predicate-set ALU instruction that pushes
 * the active mask, then a JUMP whose target address is patched later by
 * tgsi_else()/tgsi_endif(). 'opcode' is the PRED_SET* comparison. */
static int emit_if(struct r600_shader_ctx *ctx, int opcode)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type);

	/* Skip the IF body when the predicate failed; cf_addr is filled in
	 * by tgsi_else()/tgsi_endif(). */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	callstack_push(ctx, FC_PUSH_VPM);
	return 0;
}
7506
/* TGSI IF: float-compare variant (PRED_SETNE against 0). */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}
7511
/* TGSI UIF: integer-compare variant (PRED_SETNE_INT against 0). */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}
7516
/* TGSI ELSE: emit CF_OP_ELSE, record it as the mid point of the current
 * IF frame and patch the IF's JUMP to land on it. */
static int tgsi_else(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
	ctx->bc->cf_last->pop_count = 1;

	/* Remember the ELSE so tgsi_endif() can patch its target. */
	fc_set_mid(ctx, ctx->bc->fc_sp);
	/* Point the IF's JUMP at this ELSE instruction. */
	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
	return 0;
}
7526
/* TGSI ENDIF: pop one branch-stack level and patch the pending jump
 * (the IF's JUMP, or the ELSE if one was emitted) to the address just
 * past this point. */
static int tgsi_endif(struct r600_shader_ctx *ctx)
{
	pops(ctx, 1);
	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
		R600_ERR("if/endif unbalanced in shader\n");
		return -1;
	}

	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
		/* No ELSE: the IF's JUMP skips past the ENDIF and performs
		 * the pop itself. */
		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
	} else {
		/* With an ELSE: patch the ELSE's jump target instead. */
		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
	}
	fc_poplevel(ctx);

	callstack_pop(ctx, FC_PUSH_VPM);
	return 0;
}
7546
/* TGSI BGNLOOP: emit the loop-start CF instruction and open a loop
 * frame; the matching fixups happen in tgsi_endloop(). */
static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
{
	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
	 * limited to 4096 iterations, like the other LOOP_* instructions. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}
7559
/* TGSI ENDLOOP: emit LOOP_END, then patch the loop-start, loop-end and
 * any BREAK/CONTINUE ("mid") CF instructions to their final addresses. */
static int tgsi_endloop(struct r600_shader_ctx *ctx)
{
	int i;

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);

	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
		R600_ERR("loop/endloop in shader code are not paired.\n");
		return -EINVAL;
	}

	/* fixup loop pointers - from r600isa
	   LOOP END points to CF after LOOP START,
	   LOOP START point to CF after LOOP END
	   BRK/CONT point to LOOP END CF
	*/
	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;

	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;

	/* Point every recorded BREAK/CONTINUE at the LOOP_END. */
	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
	}
	/* XXX add LOOPRET support */
	fc_poplevel(ctx);
	callstack_pop(ctx, FC_LOOP);
	return 0;
}
7588
/* TGSI BREAKC: conditional loop break (break when the condition holds).
 * Two strategies depending on chip: an explicit IF + LOOP_BREAK + ENDIF
 * on chips with a broken ALU_BREAK, or a predicated ALU_BREAK clause. */
static int tgsi_loop_breakc(struct r600_shader_ctx *ctx)
{
	int r;
	unsigned int fscp;

	/* Find the innermost enclosing loop frame; slot 0 is never a valid
	 * frame, so fscp == 0 means no loop was found. */
	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
			break;
	}
	if (fscp == 0) {
		R600_ERR("BREAKC not inside loop/endloop pair\n");
		return -EINVAL;
	}

	if (ctx->bc->chip_class == EVERGREEN &&
	    ctx->bc->family != CHIP_CYPRESS &&
	    ctx->bc->family != CHIP_JUNIPER) {
		/* HW bug: ALU_BREAK does not save the active mask correctly */
		r = tgsi_uif(ctx);
		if (r)
			return r;

		r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_BREAK);
		if (r)
			return r;
		/* Register the BREAK with the loop frame so tgsi_endloop()
		 * patches its target address. */
		fc_set_mid(ctx, fscp);

		return tgsi_endif(ctx);
	} else {
		/* Predicated break: PRED_SETE_INT (break when condition == 0
		 * is false, i.e. ALU_BREAK acts on the failing lanes). */
		r = emit_logic_pred(ctx, ALU_OP2_PRED_SETE_INT, CF_OP_ALU_BREAK);
		if (r)
			return r;
		fc_set_mid(ctx, fscp);
	}

	return 0;
}
7627
7628static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
7629{
7630	unsigned int fscp;
7631
7632	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
7633	{
7634		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
7635			break;
7636	}
7637
7638	if (fscp == 0) {
7639		R600_ERR("Break not inside loop/endloop pair\n");
7640		return -EINVAL;
7641	}
7642
7643	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
7644
7645	fc_set_mid(ctx, fscp);
7646
7647	return 0;
7648}
7649
/* TGSI EMIT / ENDPRIM (geometry shaders). The stream index is taken
 * from the immediate literal selected by Src[0].x. For EMIT_VERTEX the
 * pending ring writes are flushed before the CF instruction. */
static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
	int r;

	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);

	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	if (!r)
		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
	return r;
}
7664
/* TGSI UMAD: dst = src0 * src1 + src2 (unsigned integer multiply-add),
 * emitted as MULLO_UINT into temp_reg followed by ADD_INT, since there
 * is no integer multiply-add ALU op. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		if (ctx->bc->chip_class == CAYMAN) {
			/* On Cayman MULLO_UINT is issued in all four vector
			 * slots; only slot i's result is written (see the
			 * CAYMAN notes at the top of the file). */
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP2_MULLO_UINT;
				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
				}
				alu.dst.chan = j;
				alu.dst.sel = ctx->temp_reg;
				/* Keep only the slot matching the dest channel. */
				alu.dst.write = (j == i);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.dst.chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;

			alu.op = ALU_OP2_MULLO_UINT;
			for (j = 0; j < 2; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}

			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}


	/* (src0 * src1) + src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
7736
/* TGSI opcode dispatch table for r600/r700 (pre-Evergreen) chips.
 * Each entry maps a TGSI opcode to the r600 ISA op passed to the emit
 * callback via ctx->inst_info, plus the callback itself. Entries with
 * bare numeric indices ([22], [23], ...) are TGSI opcode slots that
 * have no symbolic name in this tree; unimplemented opcodes map to
 * tgsi_unsupported. The Evergreen/Cayman tables below follow the same
 * layout but differ where those chips provide different ALU ops. */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIP_IEEE instead.
	 */
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[104]			= { ALU_OP0_NOP, tgsi_unsupported},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_loop_breakc},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
7941
7942static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
7943	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
7944	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
7945	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
7946	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
7947	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
7948	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
7949	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
7950	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
7951	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
7952	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
7953	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
7954	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
7955	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
7956	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
7957	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
7958	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
7959	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
7960	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
7961	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
7962	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
7963	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
7964	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
7965	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
7966	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
7967	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
7968	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
7969	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
7970	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
7971	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
7972	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
7973	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
7974	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
7975	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
7976	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
7977	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
7978	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
7979	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
7980	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
7981	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
7982	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
7983	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
7984	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
7985	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
7986	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
7987	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
7988	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
7989	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
7990	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
7991	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
7992	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
7993	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
7994	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
7995	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
7996	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
7997	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
7998	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
7999	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
8000	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
8001	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
8002	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
8003	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
8004	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
8005	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
8006	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
8007	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
8008	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
8009	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
8010	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
8011	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
8012	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
8013	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
8014	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
8015	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
8016	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
8017	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
8018	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
8019	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
8020	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
8021	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
8022	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
8023	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
8024	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
8025	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
8026	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
8027	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
8028	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
8029	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
8030	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
8031	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
8032	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
8033	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
8034	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
8035	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
8036	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
8037	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
8038	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8039	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
8040	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
8041	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
8042	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
8043	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
8044	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
8045	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
8046	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8047	[104]			= { ALU_OP0_NOP, tgsi_unsupported},
8048	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
8049	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
8050	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
8051	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
8052	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
8053	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
8054	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
8055	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
8056	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
8057	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
8058	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
8059	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
8060	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
8061	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
8062	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
8063	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
8064	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
8065	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
8066	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
8067	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
8068	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
8069	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
8070	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
8071	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
8072	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
8073	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
8074	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
8075	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
8076	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
8077	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
8078	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
8079	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
8080	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
8081	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
8082	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
8083	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
8084	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
8085	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
8086	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
8087	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
8088	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
8089	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
8090	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
8091	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
8092	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
8093	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
8094	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
8095	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
8096	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
8097	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
8098	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
8099	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
8100	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
8101	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
8102	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
8103	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
8104	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
8105	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
8106	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
8107	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
8108	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
8109	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
8110	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
8111	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
8112	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
8113	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
8114	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
8115	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
8116	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
8117	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
8118	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
8119	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
8120	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
8121	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
8122	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
8123	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
8124	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
8125	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
8126	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
8127	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
8128	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
8129	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
8130	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
8131	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
8132	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
8133	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
8134	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
8135	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
8136	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
8137	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
8138	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
8139	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
8140	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
8141	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
8142	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
8143	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
8144	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
8145	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
8146	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
8147	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
8148	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
8149	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
8150	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
8151	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
8152	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
8153	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
8154	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
8155	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
8156	[TGSI_OPCODE_D2I]	= { ALU_OP0_NOP, tgsi_unsupported},
8157	[TGSI_OPCODE_I2D]	= { ALU_OP0_NOP, tgsi_unsupported},
8158	[TGSI_OPCODE_D2U]	= { ALU_OP0_NOP, tgsi_unsupported},
8159	[TGSI_OPCODE_U2D]	= { ALU_OP0_NOP, tgsi_unsupported},
8160	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
8161	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
8162};
8163
/*
 * TGSI opcode -> hardware instruction dispatch table for Cayman (CM).
 *
 * Indexed by TGSI_OPCODE_*; each entry pairs a hardware opcode
 * (ALU_OP* / FETCH_OP_* / CF_OP_*) with the callback that emits the
 * translated instruction(s).  Entries using tgsi_unsupported are opcodes
 * this backend does not handle; bare numeric indices ([22], [88], ...)
 * presumably correspond to gaps/removed opcodes in the TGSI opcode
 * space -- they have no TGSI_OPCODE_* name here.
 *
 * Differences from the preceding (evergreen) table are the cayman_*
 * emitters: on Cayman the former t(rans)-slot-only ops are issued as
 * vector ops replicated across slots (see the CAYMAN notes at the top
 * of this file), so e.g. RCP/RSQ/EX2/LG2/SQRT use
 * cayman_emit_float_instr and integer multiplies use
 * cayman_mul_int_instr instead of the tgsi_op2_trans path.
 *
 * Note: some entries rely on the shared emit callback inspecting the
 * TGSI opcode itself, e.g. SUB maps to ALU_OP2_ADD (tgsi_op2 negates
 * src1) and ABS/DABS map to ALU_OP1_MOV (source abs modifier applied
 * by the callback).
 */
static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	/* transcendentals: replicated across all vector slots on Cayman */
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	/* SUB emitted as ADD; tgsi_op2 negates the second source */
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	/* ABS emitted as MOV; the callback applies the source abs modifier */
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, cayman_trig},
	/* derivatives go through the texture fetch unit */
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, cayman_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	/* control flow maps to CF instructions / structured emit helpers */
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	/* int<->float conversions are plain vector ops here (tgsi_op2),
	 * unlike the trans-slot path (tgsi_op2_trans) in the table above */
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[104]			= { ALU_OP0_NOP, tgsi_unsupported},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	/* integer multiply replicated across slots (Cayman has no trans slot) */
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	/* 64-bit (double) ops; results occupy two 32-bit channels */
	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	/* DABS emitted as MOV; abs modifier applied by the callback */
	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_I2D]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_D2U]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_U2D]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	/* sentinel: keeps the array sized to cover the full opcode range */
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
8385