r600_shader.c revision 0e49151dcfe042d937e1ac3c6eab86bb0a68cf04
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600_shader.h"
28#include "r600d.h"
29
30#include "sb/sb_public.h"
31
32#include "pipe/p_shader_tokens.h"
33#include "tgsi/tgsi_info.h"
34#include "tgsi/tgsi_parse.h"
35#include "tgsi/tgsi_scan.h"
36#include "tgsi/tgsi_dump.h"
37#include "util/u_memory.h"
38#include "util/u_math.h"
39#include <stdio.h>
40#include <errno.h>
41
42/* CAYMAN notes
43Why CAYMAN got loops for lots of instructions is explained here.
44
45-These 8xx t-slot only ops are implemented in all vector slots.
46MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47These 8xx t-slot only opcodes become vector ops, with all four
48slots expecting the arguments on sources a and b. Result is
49broadcast to all channels.
50MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51These 8xx t-slot only opcodes become vector ops in the z, y, and
52x slots.
53EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55SQRT_IEEE/_64
56SIN/COS
57The w slot may have an independent co-issued operation, or if the
58result is required to be in the w slot, the opcode above may be
59issued in the w slot as well.
60The compiler must issue the source argument to slots z, y, and x
61*/
62
63#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
64static int r600_shader_from_tgsi(struct r600_context *rctx,
65				 struct r600_pipe_shader *pipeshader,
66				 union r600_shader_key key);
67
68
69static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
70                           int size, unsigned comp_mask) {
71
72	if (!size)
73		return;
74
75	if (ps->num_arrays == ps->max_arrays) {
76		ps->max_arrays += 64;
77		ps->arrays = realloc(ps->arrays, ps->max_arrays *
78		                     sizeof(struct r600_shader_array));
79	}
80
81	int n = ps->num_arrays;
82	++ps->num_arrays;
83
84	ps->arrays[n].comp_mask = comp_mask;
85	ps->arrays[n].gpr_start = start_gpr;
86	ps->arrays[n].gpr_count = size;
87}
88
89static void r600_dump_streamout(struct pipe_stream_output_info *so)
90{
91	unsigned i;
92
93	fprintf(stderr, "STREAMOUT\n");
94	for (i = 0; i < so->num_outputs; i++) {
95		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
96				so->output[i].start_component;
97		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
98			i,
99			so->output[i].stream,
100			so->output[i].output_buffer,
101			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
102			so->output[i].register_index,
103			mask & 1 ? "x" : "",
104		        mask & 2 ? "y" : "",
105		        mask & 4 ? "z" : "",
106		        mask & 8 ? "w" : "",
107			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
108	}
109}
110
111static int store_shader(struct pipe_context *ctx,
112			struct r600_pipe_shader *shader)
113{
114	struct r600_context *rctx = (struct r600_context *)ctx;
115	uint32_t *ptr, i;
116
117	if (shader->bo == NULL) {
118		shader->bo = (struct r600_resource*)
119			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
120		if (shader->bo == NULL) {
121			return -ENOMEM;
122		}
123		ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
124		if (R600_BIG_ENDIAN) {
125			for (i = 0; i < shader->shader.bc.ndw; ++i) {
126				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
127			}
128		} else {
129			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
130		}
131		rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
132	}
133
134	return 0;
135}
136
/* Compile a shader: translate TGSI to r600 bytecode, optionally run the
 * sb optimizing backend, upload the result to a buffer object and build
 * the chip-specific register state for the shader stage.
 *
 * Returns 0 on success or a negative errno; on any failure the partially
 * built shader is torn down with r600_pipe_shader_destroy(). */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		/* dump the TGSI input (and streamout info, if any) first */
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	/* Check if the bytecode has already been built.  When using the llvm
	 * backend, r600_shader_from_tgsi() will take care of building the
	 * bytecode.
	 */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	/* Either disassemble with the plain disassembler, or run the sb
	 * backend (which can also disassemble) for optimization. */
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
		                             dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	/* A GS always has an internally generated copy shader that feeds
	 * the VS hardware stage; store it as well. */
	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case TGSI_PROCESSOR_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case TGSI_PROCESSOR_VERTEX:
		/* a VS feeding a GS runs on the ES hardware stage instead */
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (export_shader)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
252
/* Release everything a compiled shader owns: the uploaded bytecode
 * buffer object, the CPU-side bytecode, and its command buffer. */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
259
260/*
261 * tgsi -> r600 shader
262 */
263struct r600_shader_tgsi_instruction;
264
/* A TGSI source operand translated into r600 bytecode terms
 * (filled in by tgsi_src()). */
struct r600_shader_src {
	unsigned				sel;      /* source selector; V_SQ_ALU_SRC_LITERAL for immediates */
	unsigned				swizzle[4]; /* per-channel swizzle from the TGSI register */
	unsigned				neg;      /* negate modifier */
	unsigned				abs;      /* absolute-value modifier */
	unsigned				rel;      /* NOTE(review): presumably indirect-addressing flag; set outside the visible code */
	unsigned				kc_bank;  /* constant-cache bank for constant-file reads */
	boolean					kc_rel; /* true if cache bank is indexed */
	uint32_t				value[4]; /* literal words, valid when sel == V_SQ_ALU_SRC_LITERAL */
};
275
/* Per-interpolator bookkeeping for evergreen: whether the barycentric
 * set is used by the shader and which i/j pair index it was assigned
 * (see evergreen_gpr_count()). */
struct eg_interp {
	boolean					enabled;
	unsigned				ij_index;
};
280
/* All state carried through one TGSI -> r600 bytecode translation. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;     /* TGSI_PROCESSOR_* of the shader being compiled */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* first GPR used for each TGSI file */
	unsigned				temp_reg; /* base GPR for driver temps (see r600_get_temp) */
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];
	uint32_t				*literals; /* immediate pool, 4 dwords per TGSI immediate */
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;
	boolean use_llvm;
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used; /* number of COLOR inputs declared (FS two-side) */
	boolean                 clip_vertex_write;
	unsigned                cv_output;      /* output slot of CLIPVERTEX, if written */
	unsigned		edgeflag_output; /* output slot of EDGEFLAG, if written */
	int					fragcoord_input; /* input slot of POSITION in FS, or untouched */
	int					native_integers;
	int					next_ring_offset;    /* GS: ring byte offset of next input (16 bytes each) */
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
};
316
/* Dispatch-table entry: maps a TGSI opcode to the r600 ALU/fetch op it
 * uses and the callback that emits bytecode for it. */
struct r600_shader_tgsi_instruction {
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};
321
322static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
323static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
324static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
325static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
326static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
327static int tgsi_else(struct r600_shader_ctx *ctx);
328static int tgsi_endif(struct r600_shader_ctx *ctx);
329static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
330static int tgsi_endloop(struct r600_shader_ctx *ctx);
331static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
332static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
333                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
334                                unsigned int dst_reg);
335static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
336			const struct r600_shader_src *shader_src,
337			unsigned chan);
338
/* Reject TGSI instructions this backend cannot translate: multiple dst
 * registers (except DFRACEXP), predication, and dimensioned (2D)
 * registers other than CONST or GS inputs.  Returns 0 or -EINVAL. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	int j;

	/* DFRACEXP legitimately writes two destinations */
	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
	if (i->Instruction.Predicate) {
		R600_ERR("predicate unsupported\n");
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
		   switch (i->Src[j].Register.File) {
		   case TGSI_FILE_CONSTANT:
			   break;
		   case TGSI_FILE_INPUT:
			   if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
				   break;
			   /* fallthrough - 2D inputs only allowed in GS */
		   default:
			   R600_ERR("unsupported src %d (dimension %d)\n", j,
				    i->Src[j].Register.Dimension);
			   return -EINVAL;
		   }
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
381
382int eg_get_interpolator_index(unsigned interpolate, unsigned location)
383{
384	if (interpolate == TGSI_INTERPOLATE_COLOR ||
385		interpolate == TGSI_INTERPOLATE_LINEAR ||
386		interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
387	{
388		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
389		int loc;
390
391		switch(location) {
392		case TGSI_INTERPOLATE_LOC_CENTER:
393			loc = 1;
394			break;
395		case TGSI_INTERPOLATE_LOC_CENTROID:
396			loc = 2;
397			break;
398		case TGSI_INTERPOLATE_LOC_SAMPLE:
399		default:
400			loc = 0; break;
401		}
402
403		return is_linear * 3 + loc;
404	}
405
406	return -1;
407}
408
/* Copy the previously assigned i/j barycentric pair index (set up in
 * evergreen_gpr_count()) onto the given FS input. */
static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
		int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	/* the input's interpolate mode must map to a real interpolator */
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}
418
/* Emit the evergreen interpolation ALU sequence for one FS input:
 * two groups of four ops (INTERP_ZW then INTERP_XY), each group reading
 * the input's i/j barycentrics and its parameter slot.  Only the middle
 * four ops (i = 2..5) actually write - chans z,w from the ZW group and
 * chans x,y from the XY group of the input's GPR. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	/* i/j pairs are packed two per GPR, in channels (1,0) and (3,2) */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		/* ZW is computed by the first instruction group, XY by the
		 * second (hardware encoding order) */
		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* alternate between the i and j barycentric channels */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1; /* close each 4-op instruction group */
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
459
460static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
461{
462	int i, r;
463	struct r600_bytecode_alu alu;
464
465	for (i = 0; i < 4; i++) {
466		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
467
468		alu.op = ALU_OP1_INTERP_LOAD_P0;
469
470		alu.dst.sel = ctx->shader->input[input].gpr;
471		alu.dst.write = 1;
472
473		alu.dst.chan = i;
474
475		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
476		alu.src[0].chan = i;
477
478		if (i == 3)
479			alu.last = 1;
480		r = r600_bytecode_add_alu(ctx->bc, &alu);
481		if (r)
482			return r;
483	}
484	return 0;
485}
486
487/*
488 * Special export handling in shaders
489 *
490 * shader export ARRAY_BASE for EXPORT_POS:
491 * 60 is position
492 * 61 is misc vector
493 * 62, 63 are clip distance vectors
494 *
495 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
496 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
497 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
498 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
499 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
500 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
501 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
502 * exclusive from render target index)
503 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
504 *
505 *
506 * shader export ARRAY_BASE for EXPORT_PIXEL:
507 * 0-7 CB targets
508 * 61 computed Z vector
509 *
510 * The use of the values exported in the computed Z vector are controlled
511 * by DB_SHADER_CONTROL:
512 * Z_EXPORT_ENABLE - Z as a float in RED
513 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
514 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
515 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
516 * DB_SOURCE_FORMAT - export control restrictions
517 *
518 */
519
520
521/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
522static int r600_spi_sid(struct r600_shader_io * io)
523{
524	int index, name = io->name;
525
526	/* These params are handled differently, they don't need
527	 * semantic indices, so we'll use 0 for them.
528	 */
529	if (name == TGSI_SEMANTIC_POSITION ||
530	    name == TGSI_SEMANTIC_PSIZE ||
531	    name == TGSI_SEMANTIC_EDGEFLAG ||
532	    name == TGSI_SEMANTIC_FACE ||
533	    name == TGSI_SEMANTIC_SAMPLEMASK)
534		index = 0;
535	else {
536		if (name == TGSI_SEMANTIC_GENERIC) {
537			/* For generic params simply use sid from tgsi */
538			index = io->sid;
539		} else {
540			/* For non-generic params - pack name and sid into 8 bits */
541			index = 0x80 | (name<<3) | (io->sid);
542		}
543
544		/* Make sure that all really used indices have nonzero value, so
545		 * we can just compare it to 0 later instead of comparing the name
546		 * with different values to detect special cases. */
547		index++;
548	}
549
550	return index;
551};
552
553/* turn input into interpolate on EG */
554static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
555{
556	int r = 0;
557
558	if (ctx->shader->input[index].spi_sid) {
559		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
560		if (ctx->shader->input[index].interpolate > 0) {
561			evergreen_interp_assign_ij_index(ctx, index);
562			if (!ctx->use_llvm)
563				r = evergreen_interp_alu(ctx, index);
564		} else {
565			if (!ctx->use_llvm)
566				r = evergreen_interp_flat(ctx, index);
567		}
568	}
569	return r;
570}
571
572static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
573{
574	struct r600_bytecode_alu alu;
575	int i, r;
576	int gpr_front = ctx->shader->input[front].gpr;
577	int gpr_back = ctx->shader->input[back].gpr;
578
579	for (i = 0; i < 4; i++) {
580		memset(&alu, 0, sizeof(alu));
581		alu.op = ALU_OP3_CNDGT;
582		alu.is_op3 = 1;
583		alu.dst.write = 1;
584		alu.dst.sel = gpr_front;
585		alu.src[0].sel = ctx->face_gpr;
586		alu.src[1].sel = gpr_front;
587		alu.src[2].sel = gpr_back;
588
589		alu.dst.chan = i;
590		alu.src[1].chan = i;
591		alu.src[2].chan = i;
592		alu.last = (i==3);
593
594		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
595			return r;
596	}
597
598	return 0;
599}
600
601static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
602{
603	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
604}
605
606static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
607{
608	int i;
609	i = ctx->shader->noutput++;
610	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
611	ctx->shader->output[i].sid = 0;
612	ctx->shader->output[i].gpr = 0;
613	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
614	ctx->shader->output[i].write_mask = 0x4;
615	ctx->shader->output[i].spi_sid = prim_id_sid;
616
617	return 0;
618}
619
/* Process one TGSI declaration token: record inputs/outputs on the
 * shader (assigning GPRs, SPI semantic ids, interpolation and ring
 * offsets), register indirectly-addressed temp arrays, and handle the
 * system values that were not already claimed by
 * allocate_system_value_inputs().  Returns 0 or -EINVAL. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < Elements(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->chip_class >= EVERGREEN) {
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < Elements(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == TGSI_PROCESSOR_VERTEX ||
			    ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				/* track which special outputs are written; this
				 * feeds PA_CL_VS_OUT_CNTL setup (see the export
				 * notes above r600_spi_sid) */
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					ctx->shader->clip_dist_write |= d->Declaration.UsageMask <<
									((d->Semantic.Index + j) << 2);
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
				               ctx->file_offset[TGSI_FILE_TEMPORARY] +
								   d->Range.First,
				               d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			if (!ctx->native_integers) {
				/* without native ints, convert the instance id
				 * (fetched into GPR0.w) to float in place */
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		/* fallthrough - any other system value is unsupported */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
772
773static int r600_get_temp(struct r600_shader_ctx *ctx)
774{
775	return ctx->temp_reg + ctx->max_driver_temp_used++;
776}
777
/* Pre-scan the TGSI tokens and reserve input GPRs (starting at
 * gpr_offset) for the system values that live in dedicated hardware
 * registers: SAMPLEMASK (Front Face GPR) and SAMPLEID/SAMPLEPOS (Fixed
 * Point Position GPR).  Also marks the interpolators needed by
 * interpolateAt* opcodes so interp GPR counting stays in sync.
 * Returns the first GPR index after the reserved ones. */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;                       /* where to store the allocated GPR */
		unsigned name, alternate_name;  /* TGSI semantics that trigger it */
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int i, k, num_regs = 0;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					inputs[1].enabled = true; /* needs SAMPLEID */
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < Elements(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* allocate one GPR per enabled system-value input and record it
	 * both on the shader and in the ctx field it belongs to */
	for (i = 0; i < Elements(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput ++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
857
858/*
859 * for evergreen we need to scan the shader to find the number of GPRs we need to
860 * reserve for interpolation and system values
861 *
862 * we need to know if we are going to emit
863 * any sample or centroid inputs
864 * if perspective and linear are required
865*/
/*
 * for evergreen we need to scan the shader to find the number of GPRs we need to
 * reserve for interpolation and system values
 *
 * we need to know if we are going to emit
 * any sample or centroid inputs
 * if perspective and linear are required
 *
 * Returns the total number of GPRs reserved at the start of the input
 * space (barycentric pairs + system-value inputs).
 */
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	int i;
	int num_baryc;
	struct tgsi_parse_context parse;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/* mark the interpolators required by the declared FS inputs */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		}
	}

	tgsi_parse_free(&parse);

	/* assign gpr to each interpolator according to priority */
	num_baryc = 0;
	for (i = 0; i < Elements(ctx->eg_interpolators); i++) {
		if (ctx->eg_interpolators[i].enabled) {
			ctx->eg_interpolators[i].ij_index = num_baryc;
			num_baryc ++;
		}
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	/* each GPR packs two i/j barycentric pairs */
	num_baryc = (num_baryc + 1) >> 1;
	return allocate_system_value_inputs(ctx, num_baryc);
}
937
938/* sample_id_sel == NULL means fetch for current sample */
/* sample_id_sel == NULL means fetch for current sample */
/* Fetch a sample position from the R600_BUFFER_INFO constant buffer,
 * indexed either by the current sample id (from the fixed-point
 * position GPR.w) or by an explicit sample_id operand.  The xyzw
 * result lands in a fresh temp GPR whose index is returned;
 * a negative value indicates an emit error. */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	assert(ctx->fixed_pt_position_gpr != -1);

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		/* copy the requested sample id channel into the temp so the
		 * fetch can index with it from .x */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 1; // first element is size of buffer
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}
992
/* Translate a TGSI source operand into the r600 representation
 * (register select, per-channel swizzles, negate/abs flags and
 * constant-buffer bank info).
 *
 * Immediates: when all four swizzles pick the same literal, try to
 * collapse it onto an inline hw constant (0, 1, 0.5, ...); otherwise
 * keep SQ_ALU_SRC_LITERAL and copy the four dwords into r600_src.
 * System values are remapped onto the GPR/channel where the prolog (or
 * the hw) placed them.
 */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* uniform swizzle -> a single scalar literal; it may match
		 * one of the inline special constants */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			/* sample position is fetched from the buffer-info
			 * constant buffer; xy are valid, zw read as 0/0 */
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		}
	} else {
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		/* plain register file: offset into this file's GPR window */
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		/* 2D constants carry the constant-buffer bank in the
		 * Dimension index (possibly indirectly addressed) */
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
1072
/* Fetch a relatively-addressed constant via a vertex fetch, since the
 * ALU cannot read a constant indexed by the address register directly.
 *
 * cb_idx  - constant buffer id to fetch from
 * cb_rel  - buffer index mode (indexed constant buffer select)
 * offset  - constant index added on top of the AR value (0 = use AR as-is)
 * ar_chan - channel of bc->ar_reg holding the relative index
 * dst_reg - GPR receiving the fetched xyzw
 *
 * Returns 0 on success or a negative error code from the emitter.
 */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
                                unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		/* dst_reg.ar_chan = ar_reg.ar_chan + offset, then fetch
		 * indexes through dst_reg instead of the AR register */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1128
1129static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1130{
1131	struct r600_bytecode_vtx vtx;
1132	int r;
1133	unsigned index = src->Register.Index;
1134	unsigned vtx_id = src->Dimension.Index;
1135	int offset_reg = vtx_id / 3;
1136	int offset_chan = vtx_id % 3;
1137
1138	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1139	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1140
1141	if (offset_reg == 0 && offset_chan == 2)
1142		offset_chan = 3;
1143
1144	if (src->Dimension.Indirect) {
1145		int treg[3];
1146		int t2;
1147		struct r600_bytecode_alu alu;
1148		int r, i;
1149
1150		/* you have got to be shitting me -
1151		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
1152		   at least this is what fglrx seems to do. */
1153		for (i = 0; i < 3; i++) {
1154			treg[i] = r600_get_temp(ctx);
1155		}
1156		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1157
1158		t2 = r600_get_temp(ctx);
1159		for (i = 0; i < 3; i++) {
1160			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1161			alu.op = ALU_OP1_MOV;
1162			alu.src[0].sel = 0;
1163			alu.src[0].chan = i == 2 ? 3 : i;
1164			alu.dst.sel = treg[i];
1165			alu.dst.chan = 0;
1166			alu.dst.write = 1;
1167			alu.last = 1;
1168			r = r600_bytecode_add_alu(ctx->bc, &alu);
1169			if (r)
1170				return r;
1171		}
1172		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1173		alu.op = ALU_OP1_MOV;
1174		alu.src[0].sel = treg[0];
1175		alu.src[0].rel = 1;
1176		alu.dst.sel = t2;
1177		alu.dst.write = 1;
1178		alu.last = 1;
1179		r = r600_bytecode_add_alu(ctx->bc, &alu);
1180		if (r)
1181			return r;
1182		offset_reg = t2;
1183	}
1184
1185
1186	memset(&vtx, 0, sizeof(vtx));
1187	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1188	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1189	vtx.src_gpr = offset_reg;
1190	vtx.src_sel_x = offset_chan;
1191	vtx.offset = index * 16; /*bytes*/
1192	vtx.mega_fetch_count = 16;
1193	vtx.dst_gpr = dst_reg;
1194	vtx.dst_sel_x = 0;		/* SEL_X */
1195	vtx.dst_sel_y = 1;		/* SEL_Y */
1196	vtx.dst_sel_z = 2;		/* SEL_Z */
1197	vtx.dst_sel_w = 3;		/* SEL_W */
1198	if (ctx->bc->chip_class >= EVERGREEN) {
1199		vtx.use_const_fields = 1;
1200	} else {
1201		vtx.data_format = FMT_32_32_32_32_FLOAT;
1202	}
1203
1204	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1205		return r;
1206
1207	return 0;
1208}
1209
1210static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1211{
1212	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1213	int i;
1214
1215	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1216		struct tgsi_full_src_register *src = &inst->Src[i];
1217
1218		if (src->Register.File == TGSI_FILE_INPUT) {
1219			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1220				/* primitive id is in R0.z */
1221				ctx->src[i].sel = 0;
1222				ctx->src[i].swizzle[0] = 2;
1223			}
1224		}
1225		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1226			int treg = r600_get_temp(ctx);
1227
1228			fetch_gs_input(ctx, src, treg);
1229			ctx->src[i].sel = treg;
1230		}
1231	}
1232	return 0;
1233}
1234
/* Lower constant-file sources of the current instruction.
 *
 * Relatively-addressed constants are always fetched into temps (the
 * ALU cannot read an AR-indexed constant directly).  When more than
 * one direct constant is read, all but the last are copied into temps
 * as well - NOTE(review): presumably because of per-ALU-group constant
 * fetch limits; this mirrors tgsi_split_literal_constant.  Also fills
 * ctx->src[] for every source via tgsi_src().
 */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* translate all sources and count the constant-file ones */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* j counts the remaining constants still to split; the final one
	 * (j == 0) may stay in the constant file */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			/* sel - 512: constant selects start at 512 in this
			 * encoding (see register allocation comment below) */
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* copy all four channels into a temp */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
1289
1290/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1291static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1292{
1293	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1294	struct r600_bytecode_alu alu;
1295	int i, j, k, nliteral, r;
1296
1297	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1298		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1299			nliteral++;
1300		}
1301	}
1302	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1303		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1304			int treg = r600_get_temp(ctx);
1305			for (k = 0; k < 4; k++) {
1306				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1307				alu.op = ALU_OP1_MOV;
1308				alu.src[0].sel = ctx->src[i].sel;
1309				alu.src[0].chan = k;
1310				alu.src[0].value = ctx->src[i].value[k];
1311				alu.dst.sel = treg;
1312				alu.dst.chan = k;
1313				alu.dst.write = 1;
1314				if (k == 3)
1315					alu.last = 1;
1316				r = r600_bytecode_add_alu(ctx->bc, &alu);
1317				if (r)
1318					return r;
1319			}
1320			ctx->src[i].sel = treg;
1321			j--;
1322		}
1323	}
1324	return 0;
1325}
1326
1327static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1328{
1329	int i, r, count = ctx->shader->ninput;
1330
1331	for (i = 0; i < count; i++) {
1332		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1333			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
1334			if (r)
1335				return r;
1336		}
1337	}
1338	return 0;
1339}
1340
1341static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
1342						  int stream, unsigned *stream_item_size)
1343{
1344	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
1345	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
1346	int i, j, r;
1347
1348	/* Sanity checking. */
1349	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
1350		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
1351		r = -EINVAL;
1352		goto out_err;
1353	}
1354	for (i = 0; i < so->num_outputs; i++) {
1355		if (so->output[i].output_buffer >= 4) {
1356			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
1357				 so->output[i].output_buffer);
1358			r = -EINVAL;
1359			goto out_err;
1360		}
1361	}
1362
1363	/* Initialize locations where the outputs are stored. */
1364	for (i = 0; i < so->num_outputs; i++) {
1365
1366		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
1367		start_comp[i] = so->output[i].start_component;
1368		/* Lower outputs with dst_offset < start_component.
1369		 *
1370		 * We can only output 4D vectors with a write mask, e.g. we can
1371		 * only output the W component at offset 3, etc. If we want
1372		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
1373		 * to move it to X and output X. */
1374		if (so->output[i].dst_offset < so->output[i].start_component) {
1375			unsigned tmp = r600_get_temp(ctx);
1376
1377			for (j = 0; j < so->output[i].num_components; j++) {
1378				struct r600_bytecode_alu alu;
1379				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1380				alu.op = ALU_OP1_MOV;
1381				alu.src[0].sel = so_gpr[i];
1382				alu.src[0].chan = so->output[i].start_component + j;
1383
1384				alu.dst.sel = tmp;
1385				alu.dst.chan = j;
1386				alu.dst.write = 1;
1387				if (j == so->output[i].num_components - 1)
1388					alu.last = 1;
1389				r = r600_bytecode_add_alu(ctx->bc, &alu);
1390				if (r)
1391					return r;
1392			}
1393			start_comp[i] = 0;
1394			so_gpr[i] = tmp;
1395		}
1396	}
1397
1398	/* Write outputs to buffers. */
1399	for (i = 0; i < so->num_outputs; i++) {
1400		struct r600_bytecode_output output;
1401
1402		if (stream != -1 && stream != so->output[i].output_buffer)
1403			continue;
1404
1405		memset(&output, 0, sizeof(struct r600_bytecode_output));
1406		output.gpr = so_gpr[i];
1407		output.elem_size = so->output[i].num_components - 1;
1408		if (output.elem_size == 2)
1409			output.elem_size = 3; // 3 not supported, write 4 with junk at end
1410		output.array_base = so->output[i].dst_offset - start_comp[i];
1411		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1412		output.burst_count = 1;
1413		/* array_size is an upper limit for the burst_count
1414		 * with MEM_STREAM instructions */
1415		output.array_size = 0xFFF;
1416		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
1417
1418		if (ctx->bc->chip_class >= EVERGREEN) {
1419			switch (so->output[i].output_buffer) {
1420			case 0:
1421				output.op = CF_OP_MEM_STREAM0_BUF0;
1422				break;
1423			case 1:
1424				output.op = CF_OP_MEM_STREAM0_BUF1;
1425				break;
1426			case 2:
1427				output.op = CF_OP_MEM_STREAM0_BUF2;
1428				break;
1429			case 3:
1430				output.op = CF_OP_MEM_STREAM0_BUF3;
1431				break;
1432			}
1433			output.op += so->output[i].stream * 4;
1434			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
1435			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
1436		} else {
1437			switch (so->output[i].output_buffer) {
1438			case 0:
1439				output.op = CF_OP_MEM_STREAM0;
1440				break;
1441			case 1:
1442				output.op = CF_OP_MEM_STREAM1;
1443				break;
1444			case 2:
1445				output.op = CF_OP_MEM_STREAM2;
1446				break;
1447			case 3:
1448				output.op = CF_OP_MEM_STREAM3;
1449					break;
1450			}
1451			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
1452		}
1453		r = r600_bytecode_add_output(ctx->bc, &output);
1454		if (r)
1455			goto out_err;
1456	}
1457	return 0;
1458out_err:
1459	return r;
1460}
1461
1462static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
1463{
1464	struct r600_bytecode_alu alu;
1465	unsigned reg;
1466
1467	if (!ctx->shader->vs_out_edgeflag)
1468		return;
1469
1470	reg = ctx->shader->output[ctx->edgeflag_output].gpr;
1471
1472	/* clamp(x, 0, 1) */
1473	memset(&alu, 0, sizeof(alu));
1474	alu.op = ALU_OP1_MOV;
1475	alu.src[0].sel = reg;
1476	alu.dst.sel = reg;
1477	alu.dst.write = 1;
1478	alu.dst.clamp = 1;
1479	alu.last = 1;
1480	r600_bytecode_add_alu(ctx->bc, &alu);
1481
1482	memset(&alu, 0, sizeof(alu));
1483	alu.op = ALU_OP1_FLT_TO_INT;
1484	alu.src[0].sel = reg;
1485	alu.dst.sel = reg;
1486	alu.dst.write = 1;
1487	alu.last = 1;
1488	r600_bytecode_add_alu(ctx->bc, &alu);
1489}
1490
/* Build the VS "copy shader" that runs after a geometry shader: it
 * reads the emitted vertices back from the GSVS ring buffer, performs
 * per-stream stream-out, and re-exports the vertex like a regular VS
 * (position, clip distances, parameters).
 *
 * On entry R0.x holds the ring read offset with the stream id packed
 * into its top two bits; the prolog below separates them.
 *
 * Returns the result of r600_bytecode_build().
 * NOTE(review): returns 0 (success) when the calloc fails - callers
 * cannot distinguish that from success; confirm intended.
 */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	int ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int i, j, next_clip_pos = 61, next_param = 0;
	int ring;

	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	/* the copy shader exports exactly the GS outputs */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x = R0.x & 0x3fffffff  (strip the stream id bits, keeping
	 * the ring offset) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30  (extract the stream id) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	/* temps start after the last fetched output GPR */
	ctx.temp_reg = i + 1;
	/* per-stream predicated stream-out: for every ring with outputs
	 * (ring 0 always), emit PRED_SETE on the stream id and a JUMP
	 * over the stream-out block; jump targets are patched once the
	 * matching POP is known. */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label of the previous stream's block
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* only outputs that belong to stream 0 are rasterized */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			/* point size goes to the misc vec (POS export 61).x */
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			/* layer goes to the misc vec .z */
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			/* viewport index goes to the misc vec .w */
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* the hw requires at least one POS export; emit a dummy one if
	 * nothing above produced one */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	/* likewise at least one PARAM export */
	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* the last export of each kind must be EXPORT_DONE */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* close the final stream's predicated block */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	/* one stack slot for the predicate push above */
	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
1796
1797static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
1798{
1799	if (ind) {
1800		struct r600_bytecode_alu alu;
1801		int r;
1802
1803		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1804		alu.op = ALU_OP2_ADD_INT;
1805		alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
1806		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1807		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
1808		alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
1809		alu.dst.write = 1;
1810		alu.last = 1;
1811		r = r600_bytecode_add_alu(ctx->bc, &alu);
1812		if (r)
1813			return r;
1814	}
1815	return 0;
1816}
1817
/* Write the current vertex's outputs to the output ring via MEM_RING
 * exports and advance gs_next_vertex.
 *
 * For an ES (ctx->gs_for_vs set) each output is matched by name/sid to
 * the ring offset the consuming GS expects; outputs the GS never reads
 * are skipped.  For a real GS, outputs are packed consecutively.
 *
 * stream selects the MEM_RING op (stream == -1 acts as stream 0 for
 * the index register).  ind selects indirect addressing through the
 * per-stream export offset register; otherwise the offset is computed
 * from gs_next_vertex.
 */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int i, k, ring_offset;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			if (ring_offset == -1)
				continue;
		} else {
			ring_offset = idx * 16;
			idx++;
		}

		/* position is only exported on stream 0 */
		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	++ctx->gs_next_vertex;
	return 0;
}
1887
1888static int r600_shader_from_tgsi(struct r600_context *rctx,
1889				 struct r600_pipe_shader *pipeshader,
1890				 union r600_shader_key key)
1891{
1892	struct r600_screen *rscreen = rctx->screen;
1893	struct r600_shader *shader = &pipeshader->shader;
1894	struct tgsi_token *tokens = pipeshader->selector->tokens;
1895	struct pipe_stream_output_info so = pipeshader->selector->so;
1896	struct tgsi_full_immediate *immediate;
1897	struct r600_shader_ctx ctx;
1898	struct r600_bytecode_output output[32];
1899	unsigned output_done, noutput;
1900	unsigned opcode;
1901	int i, j, k, r = 0;
1902	int next_param_base = 0, next_clip_base;
1903	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
1904	/* Declarations used by llvm code */
1905	bool use_llvm = false;
1906	bool indirect_gprs;
1907	bool ring_outputs = false;
1908	bool pos_emitted = false;
1909
1910#ifdef R600_USE_LLVM
1911	use_llvm = rscreen->b.debug_flags & DBG_LLVM;
1912#endif
1913	ctx.bc = &shader->bc;
1914	ctx.shader = shader;
1915	ctx.native_integers = true;
1916
1917
1918	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
1919			   rscreen->has_compressed_msaa_texturing);
1920	ctx.tokens = tokens;
1921	tgsi_scan_shader(tokens, &ctx.info);
1922	shader->indirect_files = ctx.info.indirect_files;
1923
1924	shader->uses_doubles = ctx.info.uses_doubles;
1925
1926	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
1927	tgsi_parse_init(&ctx.parse, tokens);
1928	ctx.type = ctx.info.processor;
1929	shader->processor_type = ctx.type;
1930	ctx.bc->type = shader->processor_type;
1931
1932	switch (ctx.type) {
1933	case TGSI_PROCESSOR_VERTEX:
1934		shader->vs_as_gs_a = key.vs.as_gs_a;
1935		shader->vs_as_es = key.vs.as_es;
1936		if (shader->vs_as_es)
1937			ring_outputs = true;
1938		break;
1939	case TGSI_PROCESSOR_GEOMETRY:
1940		ring_outputs = true;
1941		break;
1942	case TGSI_PROCESSOR_FRAGMENT:
1943		shader->two_side = key.ps.color_two_side;
1944		break;
1945	default:
1946		break;
1947	}
1948
1949	if (shader->vs_as_es) {
1950		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
1951	} else {
1952		ctx.gs_for_vs = NULL;
1953	}
1954
1955	ctx.next_ring_offset = 0;
1956	ctx.gs_out_ring_offset = 0;
1957	ctx.gs_next_vertex = 0;
1958	ctx.gs_stream_output_info = &so;
1959
1960	ctx.face_gpr = -1;
1961	ctx.fixed_pt_position_gpr = -1;
1962	ctx.fragcoord_input = -1;
1963	ctx.colors_used = 0;
1964	ctx.clip_vertex_write = 0;
1965
1966	shader->nr_ps_color_exports = 0;
1967	shader->nr_ps_max_color_exports = 0;
1968
1969
1970	/* register allocations */
1971	/* Values [0,127] correspond to GPR[0..127].
1972	 * Values [128,159] correspond to constant buffer bank 0
1973	 * Values [160,191] correspond to constant buffer bank 1
1974	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1975	 * Values [256,287] correspond to constant buffer bank 2 (EG)
1976	 * Values [288,319] correspond to constant buffer bank 3 (EG)
1977	 * Other special values are shown in the list below.
1978	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1979	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1980	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1981	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1982	 * 248	SQ_ALU_SRC_0: special constant 0.0.
1983	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
1984	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
1985	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1986	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
1987	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
1988	 * 254	SQ_ALU_SRC_PV: previous vector result.
1989	 * 255	SQ_ALU_SRC_PS: previous scalar result.
1990	 */
1991	for (i = 0; i < TGSI_FILE_COUNT; i++) {
1992		ctx.file_offset[i] = 0;
1993	}
1994
1995#ifdef R600_USE_LLVM
1996	if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
1997		fprintf(stderr, "Warning: R600 LLVM backend does not support "
1998				"indirect adressing.  Falling back to TGSI "
1999				"backend.\n");
2000		use_llvm = 0;
2001	}
2002#endif
2003	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
2004		ctx.file_offset[TGSI_FILE_INPUT] = 1;
2005		if (!use_llvm) {
2006			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
2007		}
2008	}
2009	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
2010		if (ctx.bc->chip_class >= EVERGREEN)
2011			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
2012		else
2013			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
2014	}
2015	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2016		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
2017		ctx.file_offset[TGSI_FILE_INPUT] = 2;
2018	}
2019	ctx.use_llvm = use_llvm;
2020
2021	if (use_llvm) {
2022		ctx.file_offset[TGSI_FILE_OUTPUT] =
2023			ctx.file_offset[TGSI_FILE_INPUT];
2024	} else {
2025	   ctx.file_offset[TGSI_FILE_OUTPUT] =
2026			ctx.file_offset[TGSI_FILE_INPUT] +
2027			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
2028	}
2029	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
2030						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
2031
2032	/* Outside the GPR range. This will be translated to one of the
2033	 * kcache banks later. */
2034	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
2035
2036	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
2037	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
2038			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
2039	ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
2040	ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
2041
2042	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2043		ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
2044		ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
2045		ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
2046		ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
2047		ctx.temp_reg = ctx.bc->ar_reg + 7;
2048	} else {
2049		ctx.temp_reg = ctx.bc->ar_reg + 3;
2050	}
2051
2052	shader->max_arrays = 0;
2053	shader->num_arrays = 0;
2054	if (indirect_gprs) {
2055
2056		if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
2057			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
2058			                   ctx.file_offset[TGSI_FILE_OUTPUT] -
2059			                   ctx.file_offset[TGSI_FILE_INPUT],
2060			                   0x0F);
2061		}
2062		if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
2063			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
2064			                   ctx.file_offset[TGSI_FILE_TEMPORARY] -
2065			                   ctx.file_offset[TGSI_FILE_OUTPUT],
2066			                   0x0F);
2067		}
2068	}
2069
2070	ctx.nliterals = 0;
2071	ctx.literals = NULL;
2072
2073	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
2074	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
2075	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
2076
2077	if (shader->vs_as_gs_a)
2078		vs_add_primid_output(&ctx, key.vs.prim_id_out);
2079
2080	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
2081		tgsi_parse_token(&ctx.parse);
2082		switch (ctx.parse.FullToken.Token.Type) {
2083		case TGSI_TOKEN_TYPE_IMMEDIATE:
2084			immediate = &ctx.parse.FullToken.FullImmediate;
2085			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
2086			if(ctx.literals == NULL) {
2087				r = -ENOMEM;
2088				goto out_err;
2089			}
2090			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
2091			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
2092			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
2093			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
2094			ctx.nliterals++;
2095			break;
2096		case TGSI_TOKEN_TYPE_DECLARATION:
2097			r = tgsi_declaration(&ctx);
2098			if (r)
2099				goto out_err;
2100			break;
2101		case TGSI_TOKEN_TYPE_INSTRUCTION:
2102		case TGSI_TOKEN_TYPE_PROPERTY:
2103			break;
2104		default:
2105			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
2106			r = -EINVAL;
2107			goto out_err;
2108		}
2109	}
2110
2111	shader->ring_item_sizes[0] = ctx.next_ring_offset;
2112	shader->ring_item_sizes[1] = 0;
2113	shader->ring_item_sizes[2] = 0;
2114	shader->ring_item_sizes[3] = 0;
2115
2116	/* Process two side if needed */
2117	if (shader->two_side && ctx.colors_used) {
2118		int i, count = ctx.shader->ninput;
2119		unsigned next_lds_loc = ctx.shader->nlds;
2120
2121		/* additional inputs will be allocated right after the existing inputs,
2122		 * we won't need them after the color selection, so we don't need to
2123		 * reserve these gprs for the rest of the shader code and to adjust
2124		 * output offsets etc. */
2125		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
2126				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
2127
2128		/* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
2129		if (ctx.face_gpr == -1) {
2130			i = ctx.shader->ninput++;
2131			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
2132			ctx.shader->input[i].spi_sid = 0;
2133			ctx.shader->input[i].gpr = gpr++;
2134			ctx.face_gpr = ctx.shader->input[i].gpr;
2135		}
2136
2137		for (i = 0; i < count; i++) {
2138			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2139				int ni = ctx.shader->ninput++;
2140				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
2141				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
2142				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
2143				ctx.shader->input[ni].gpr = gpr++;
2144				// TGSI to LLVM needs to know the lds position of inputs.
2145				// Non LLVM path computes it later (in process_twoside_color)
2146				ctx.shader->input[ni].lds_pos = next_lds_loc++;
2147				ctx.shader->input[i].back_color_input = ni;
2148				if (ctx.bc->chip_class >= EVERGREEN) {
2149					if ((r = evergreen_interp_input(&ctx, ni)))
2150						return r;
2151				}
2152			}
2153		}
2154	}
2155
2156/* LLVM backend setup */
2157#ifdef R600_USE_LLVM
2158	if (use_llvm) {
2159		struct radeon_llvm_context radeon_llvm_ctx;
2160		LLVMModuleRef mod;
2161		bool dump = r600_can_dump_shader(&rscreen->b, tokens);
2162		boolean use_kill = false;
2163
2164		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
2165		radeon_llvm_ctx.type = ctx.type;
2166		radeon_llvm_ctx.two_side = shader->two_side;
2167		radeon_llvm_ctx.face_gpr = ctx.face_gpr;
2168		radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
2169		radeon_llvm_ctx.r600_inputs = ctx.shader->input;
2170		radeon_llvm_ctx.r600_outputs = ctx.shader->output;
2171		radeon_llvm_ctx.color_buffer_count = max_color_exports;
2172		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
2173		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
2174		radeon_llvm_ctx.stream_outputs = &so;
2175		radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one;
2176		radeon_llvm_ctx.has_compressed_msaa_texturing =
2177			ctx.bc->has_compressed_msaa_texturing;
2178		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
2179		ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
2180		ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
2181
2182		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
2183			radeon_llvm_dispose(&radeon_llvm_ctx);
2184			use_llvm = 0;
2185			fprintf(stderr, "R600 LLVM backend failed to compile "
2186				"shader.  Falling back to TGSI\n");
2187		} else {
2188			ctx.file_offset[TGSI_FILE_OUTPUT] =
2189					ctx.file_offset[TGSI_FILE_INPUT];
2190		}
2191		if (use_kill)
2192			ctx.shader->uses_kill = use_kill;
2193		radeon_llvm_dispose(&radeon_llvm_ctx);
2194	}
2195#endif
2196/* End of LLVM backend setup */
2197
2198	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
2199		shader->nr_ps_max_color_exports = 8;
2200
2201	if (!use_llvm) {
2202		if (ctx.fragcoord_input >= 0) {
2203			if (ctx.bc->chip_class == CAYMAN) {
2204				for (j = 0 ; j < 4; j++) {
2205					struct r600_bytecode_alu alu;
2206					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2207					alu.op = ALU_OP1_RECIP_IEEE;
2208					alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2209					alu.src[0].chan = 3;
2210
2211					alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2212					alu.dst.chan = j;
2213					alu.dst.write = (j == 3);
2214					alu.last = 1;
2215					if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2216						return r;
2217				}
2218			} else {
2219				struct r600_bytecode_alu alu;
2220				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2221				alu.op = ALU_OP1_RECIP_IEEE;
2222				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2223				alu.src[0].chan = 3;
2224
2225				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2226				alu.dst.chan = 3;
2227				alu.dst.write = 1;
2228				alu.last = 1;
2229				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2230					return r;
2231			}
2232		}
2233
2234		if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2235			struct r600_bytecode_alu alu;
2236			int r;
2237
2238			/* GS thread with no output workaround - emit a cut at start of GS */
2239			if (ctx.bc->chip_class == R600)
2240				r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
2241
2242			for (j = 0; j < 4; j++) {
2243				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2244				alu.op = ALU_OP1_MOV;
2245				alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
2246				alu.src[0].value = 0;
2247				alu.dst.sel = ctx.gs_export_gpr_tregs[j];
2248				alu.dst.write = 1;
2249				alu.last = 1;
2250				r = r600_bytecode_add_alu(ctx.bc, &alu);
2251				if (r)
2252					return r;
2253			}
2254		}
2255		if (shader->two_side && ctx.colors_used) {
2256			if ((r = process_twoside_color_inputs(&ctx)))
2257				return r;
2258		}
2259
2260		tgsi_parse_init(&ctx.parse, tokens);
2261		while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
2262			tgsi_parse_token(&ctx.parse);
2263			switch (ctx.parse.FullToken.Token.Type) {
2264			case TGSI_TOKEN_TYPE_INSTRUCTION:
2265				r = tgsi_is_supported(&ctx);
2266				if (r)
2267					goto out_err;
2268				ctx.max_driver_temp_used = 0;
2269				/* reserve first tmp for everyone */
2270				r600_get_temp(&ctx);
2271
2272				opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
2273				if ((r = tgsi_split_constant(&ctx)))
2274					goto out_err;
2275				if ((r = tgsi_split_literal_constant(&ctx)))
2276					goto out_err;
2277				if (ctx.type == TGSI_PROCESSOR_GEOMETRY)
2278					if ((r = tgsi_split_gs_inputs(&ctx)))
2279						goto out_err;
2280				if (ctx.bc->chip_class == CAYMAN)
2281					ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
2282				else if (ctx.bc->chip_class >= EVERGREEN)
2283					ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
2284				else
2285					ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
2286				r = ctx.inst_info->process(&ctx);
2287				if (r)
2288					goto out_err;
2289				break;
2290			default:
2291				break;
2292			}
2293		}
2294	}
2295
2296	/* Reset the temporary register counter. */
2297	ctx.max_driver_temp_used = 0;
2298
2299	noutput = shader->noutput;
2300
2301	if (!ring_outputs && ctx.clip_vertex_write) {
2302		unsigned clipdist_temp[2];
2303
2304		clipdist_temp[0] = r600_get_temp(&ctx);
2305		clipdist_temp[1] = r600_get_temp(&ctx);
2306
2307		/* need to convert a clipvertex write into clipdistance writes and not export
2308		   the clip vertex anymore */
2309
2310		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
2311		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2312		shader->output[noutput].gpr = clipdist_temp[0];
2313		noutput++;
2314		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2315		shader->output[noutput].gpr = clipdist_temp[1];
2316		noutput++;
2317
2318		/* reset spi_sid for clipvertex output to avoid confusing spi */
2319		shader->output[ctx.cv_output].spi_sid = 0;
2320
2321		shader->clip_dist_write = 0xFF;
2322
2323		for (i = 0; i < 8; i++) {
2324			int oreg = i >> 2;
2325			int ochan = i & 3;
2326
2327			for (j = 0; j < 4; j++) {
2328				struct r600_bytecode_alu alu;
2329				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2330				alu.op = ALU_OP2_DOT4;
2331				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
2332				alu.src[0].chan = j;
2333
2334				alu.src[1].sel = 512 + i;
2335				alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
2336				alu.src[1].chan = j;
2337
2338				alu.dst.sel = clipdist_temp[oreg];
2339				alu.dst.chan = j;
2340				alu.dst.write = (j == ochan);
2341				if (j == 3)
2342					alu.last = 1;
2343				if (!use_llvm)
2344					r = r600_bytecode_add_alu(ctx.bc, &alu);
2345				if (r)
2346					return r;
2347			}
2348		}
2349	}
2350
2351	/* Add stream outputs. */
2352	if (!ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX &&
2353	    so.num_outputs && !use_llvm)
2354		emit_streamout(&ctx, &so, -1, NULL);
2355
2356	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2357	convert_edgeflag_to_int(&ctx);
2358
2359	if (ring_outputs) {
2360		if (shader->vs_as_es) {
2361			ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
2362			ctx.gs_export_gpr_tregs[1] = -1;
2363			ctx.gs_export_gpr_tregs[2] = -1;
2364			ctx.gs_export_gpr_tregs[3] = -1;
2365
2366			emit_gs_ring_writes(&ctx, &so, -1, FALSE);
2367		}
2368	} else {
2369		/* Export output */
2370		next_clip_base = shader->vs_out_misc_write ? 62 : 61;
2371
2372		for (i = 0, j = 0; i < noutput; i++, j++) {
2373			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2374			output[j].gpr = shader->output[i].gpr;
2375			output[j].elem_size = 3;
2376			output[j].swizzle_x = 0;
2377			output[j].swizzle_y = 1;
2378			output[j].swizzle_z = 2;
2379			output[j].swizzle_w = 3;
2380			output[j].burst_count = 1;
2381			output[j].type = -1;
2382			output[j].op = CF_OP_EXPORT;
2383			switch (ctx.type) {
2384			case TGSI_PROCESSOR_VERTEX:
2385				switch (shader->output[i].name) {
2386				case TGSI_SEMANTIC_POSITION:
2387					output[j].array_base = 60;
2388					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2389					pos_emitted = true;
2390					break;
2391
2392				case TGSI_SEMANTIC_PSIZE:
2393					output[j].array_base = 61;
2394					output[j].swizzle_y = 7;
2395					output[j].swizzle_z = 7;
2396					output[j].swizzle_w = 7;
2397					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2398					pos_emitted = true;
2399					break;
2400				case TGSI_SEMANTIC_EDGEFLAG:
2401					output[j].array_base = 61;
2402					output[j].swizzle_x = 7;
2403					output[j].swizzle_y = 0;
2404					output[j].swizzle_z = 7;
2405					output[j].swizzle_w = 7;
2406					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2407					pos_emitted = true;
2408					break;
2409				case TGSI_SEMANTIC_LAYER:
2410					/* spi_sid is 0 for outputs that are
2411					 * not consumed by PS */
2412					if (shader->output[i].spi_sid) {
2413						output[j].array_base = next_param_base++;
2414						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2415						j++;
2416						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2417					}
2418					output[j].array_base = 61;
2419					output[j].swizzle_x = 7;
2420					output[j].swizzle_y = 7;
2421					output[j].swizzle_z = 0;
2422					output[j].swizzle_w = 7;
2423					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2424					pos_emitted = true;
2425					break;
2426				case TGSI_SEMANTIC_VIEWPORT_INDEX:
2427					/* spi_sid is 0 for outputs that are
2428					 * not consumed by PS */
2429					if (shader->output[i].spi_sid) {
2430						output[j].array_base = next_param_base++;
2431						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2432						j++;
2433						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2434					}
2435					output[j].array_base = 61;
2436					output[j].swizzle_x = 7;
2437					output[j].swizzle_y = 7;
2438					output[j].swizzle_z = 7;
2439					output[j].swizzle_w = 0;
2440					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2441					pos_emitted = true;
2442					break;
2443				case TGSI_SEMANTIC_CLIPVERTEX:
2444					j--;
2445					break;
2446				case TGSI_SEMANTIC_CLIPDIST:
2447					output[j].array_base = next_clip_base++;
2448					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2449					pos_emitted = true;
2450					/* spi_sid is 0 for clipdistance outputs that were generated
2451					 * for clipvertex - we don't need to pass them to PS */
2452					if (shader->output[i].spi_sid) {
2453						j++;
2454						/* duplicate it as PARAM to pass to the pixel shader */
2455						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2456						output[j].array_base = next_param_base++;
2457						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2458					}
2459					break;
2460				case TGSI_SEMANTIC_FOG:
2461					output[j].swizzle_y = 4; /* 0 */
2462					output[j].swizzle_z = 4; /* 0 */
2463					output[j].swizzle_w = 5; /* 1 */
2464					break;
2465				case TGSI_SEMANTIC_PRIMID:
2466					output[j].swizzle_x = 2;
2467					output[j].swizzle_y = 4; /* 0 */
2468					output[j].swizzle_z = 4; /* 0 */
2469					output[j].swizzle_w = 4; /* 0 */
2470					break;
2471				}
2472
2473				break;
2474			case TGSI_PROCESSOR_FRAGMENT:
2475				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
2476					/* never export more colors than the number of CBs */
2477					if (shader->output[i].sid >= max_color_exports) {
2478						/* skip export */
2479						j--;
2480						continue;
2481					}
2482					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
2483					output[j].array_base = shader->output[i].sid;
2484					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2485					shader->nr_ps_color_exports++;
2486					if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
2487						for (k = 1; k < max_color_exports; k++) {
2488							j++;
2489							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2490							output[j].gpr = shader->output[i].gpr;
2491							output[j].elem_size = 3;
2492							output[j].swizzle_x = 0;
2493							output[j].swizzle_y = 1;
2494							output[j].swizzle_z = 2;
2495							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
2496							output[j].burst_count = 1;
2497							output[j].array_base = k;
2498							output[j].op = CF_OP_EXPORT;
2499							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2500							shader->nr_ps_color_exports++;
2501						}
2502					}
2503				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
2504					output[j].array_base = 61;
2505					output[j].swizzle_x = 2;
2506					output[j].swizzle_y = 7;
2507					output[j].swizzle_z = output[j].swizzle_w = 7;
2508					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2509				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
2510					output[j].array_base = 61;
2511					output[j].swizzle_x = 7;
2512					output[j].swizzle_y = 1;
2513					output[j].swizzle_z = output[j].swizzle_w = 7;
2514					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2515				} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
2516					output[j].array_base = 61;
2517					output[j].swizzle_x = 7;
2518					output[j].swizzle_y = 7;
2519					output[j].swizzle_z = 0;
2520					output[j].swizzle_w = 7;
2521					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2522				} else {
2523					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
2524					r = -EINVAL;
2525					goto out_err;
2526				}
2527				break;
2528			default:
2529				R600_ERR("unsupported processor type %d\n", ctx.type);
2530				r = -EINVAL;
2531				goto out_err;
2532			}
2533
2534			if (output[j].type==-1) {
2535				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2536				output[j].array_base = next_param_base++;
2537			}
2538		}
2539
2540		/* add fake position export */
2541		if (ctx.type == TGSI_PROCESSOR_VERTEX && pos_emitted == false) {
2542			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2543			output[j].gpr = 0;
2544			output[j].elem_size = 3;
2545			output[j].swizzle_x = 7;
2546			output[j].swizzle_y = 7;
2547			output[j].swizzle_z = 7;
2548			output[j].swizzle_w = 7;
2549			output[j].burst_count = 1;
2550			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2551			output[j].array_base = 60;
2552			output[j].op = CF_OP_EXPORT;
2553			j++;
2554		}
2555
2556		/* add fake param output for vertex shader if no param is exported */
2557		if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
2558			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2559			output[j].gpr = 0;
2560			output[j].elem_size = 3;
2561			output[j].swizzle_x = 7;
2562			output[j].swizzle_y = 7;
2563			output[j].swizzle_z = 7;
2564			output[j].swizzle_w = 7;
2565			output[j].burst_count = 1;
2566			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2567			output[j].array_base = 0;
2568			output[j].op = CF_OP_EXPORT;
2569			j++;
2570		}
2571
2572		/* add fake pixel export */
2573		if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
2574			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2575			output[j].gpr = 0;
2576			output[j].elem_size = 3;
2577			output[j].swizzle_x = 7;
2578			output[j].swizzle_y = 7;
2579			output[j].swizzle_z = 7;
2580			output[j].swizzle_w = 7;
2581			output[j].burst_count = 1;
2582			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2583			output[j].array_base = 0;
2584			output[j].op = CF_OP_EXPORT;
2585			j++;
2586			shader->nr_ps_color_exports++;
2587		}
2588
2589		noutput = j;
2590
2591		/* set export done on last export of each type */
2592		for (i = noutput - 1, output_done = 0; i >= 0; i--) {
2593			if (!(output_done & (1 << output[i].type))) {
2594				output_done |= (1 << output[i].type);
2595				output[i].op = CF_OP_EXPORT_DONE;
2596			}
2597		}
2598		/* add output to bytecode */
2599		if (!use_llvm) {
2600			for (i = 0; i < noutput; i++) {
2601				r = r600_bytecode_add_output(ctx.bc, &output[i]);
2602				if (r)
2603					goto out_err;
2604			}
2605		}
2606	}
2607
2608	/* add program end */
2609	if (!use_llvm) {
2610		if (ctx.bc->chip_class == CAYMAN)
2611			cm_bytecode_add_cf_end(ctx.bc);
2612		else {
2613			const struct cf_op_info *last = NULL;
2614
2615			if (ctx.bc->cf_last)
2616				last = r600_isa_cf(ctx.bc->cf_last->op);
2617
2618			/* alu clause instructions don't have EOP bit, so add NOP */
2619			if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS)
2620				r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2621
2622			ctx.bc->cf_last->end_of_program = 1;
2623		}
2624	}
2625
2626	/* check GPR limit - we have 124 = 128 - 4
2627	 * (4 are reserved as alu clause temporary registers) */
2628	if (ctx.bc->ngpr > 124) {
2629		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
2630		r = -ENOMEM;
2631		goto out_err;
2632	}
2633
2634	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2635		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
2636			return r;
2637	}
2638
2639	free(ctx.literals);
2640	tgsi_parse_free(&ctx.parse);
2641	return 0;
2642out_err:
2643	free(ctx.literals);
2644	tgsi_parse_free(&ctx.parse);
2645	return r;
2646}
2647
2648static int tgsi_unsupported(struct r600_shader_ctx *ctx)
2649{
2650	const unsigned tgsi_opcode =
2651		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
2652	R600_ERR("%s tgsi opcode unsupported\n",
2653		 tgsi_get_opcode_name(tgsi_opcode));
2654	return -EINVAL;
2655}
2656
/* Handler for TGSI_OPCODE_END: intentionally a no-op.  The program-end
 * marker (EOP / trailing NOP) is appended after the whole token stream
 * has been processed, not per END token. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	return 0;
}
2661
2662static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
2663			const struct r600_shader_src *shader_src,
2664			unsigned chan)
2665{
2666	bc_src->sel = shader_src->sel;
2667	bc_src->chan = shader_src->swizzle[chan];
2668	bc_src->neg = shader_src->neg;
2669	bc_src->abs = shader_src->abs;
2670	bc_src->rel = shader_src->rel;
2671	bc_src->value = shader_src->value[bc_src->chan];
2672	bc_src->kc_bank = shader_src->kc_bank;
2673	bc_src->kc_rel = shader_src->kc_rel;
2674}
2675
2676static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
2677{
2678	bc_src->abs = 1;
2679	bc_src->neg = 0;
2680}
2681
2682static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
2683{
2684	bc_src->neg = !bc_src->neg;
2685}
2686
2687static void tgsi_dst(struct r600_shader_ctx *ctx,
2688		     const struct tgsi_full_dst_register *tgsi_dst,
2689		     unsigned swizzle,
2690		     struct r600_bytecode_alu_dst *r600_dst)
2691{
2692	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2693
2694	r600_dst->sel = tgsi_dst->Register.Index;
2695	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
2696	r600_dst->chan = swizzle;
2697	r600_dst->write = 1;
2698	if (tgsi_dst->Register.Indirect)
2699		r600_dst->rel = V_SQ_REL_RELATIVE;
2700	if (inst->Instruction.Saturate) {
2701		r600_dst->clamp = 1;
2702	}
2703}
2704
/* Return the index of the highest component selected by a 4-bit
 * writemask (0 when the mask is empty).  This is the channel on which
 * the last ALU slot of the group must be flagged. */
static int tgsi_last_instruction(unsigned writemask)
{
	int i;

	for (i = 3; i >= 0; i--) {
		if (writemask & (1u << i))
			return i;
	}
	return 0;
}
2716
2717
2718
/* Emit a two-source 64-bit (double) ALU operation.  Doubles occupy a
 * channel pair (xy holds one double, zw another), so a TGSI writemask
 * component must be widened to its pair before emitting.
 *
 * singledest: the op produces a single double result (e.g. DADD-style
 *             ops writing one double); the writemask then names one
 *             component and is expanded to the covering pair below.
 * swap:       exchange src[0] and src[1] when emitting.
 *
 * Returns 0 on success or the error from r600_bytecode_add_alu(). */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;

	if (singledest) {
		/* Widen the single-component mask to its channel pair.  When
		 * the requested destination is the high half of the pair
		 * (.y or .w), the result must be staged through temp_reg and
		 * moved afterwards; use_tmp records (source channel + 1) for
		 * that final move. */
		switch (write_mask) {
		case 0x1:
			write_mask = 0x3;
			break;
		case 0x2:
			use_tmp = 1;
			write_mask = 0x3;
			break;
		case 0x4:
			write_mask = 0xc;
			break;
		case 0x8:
			write_mask = 0xc;
			use_tmp = 3;
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			if (use_tmp) {
				/* stage into the scratch temp; moved to the
				 * real destination after the loop */
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
			}
			/* only the low half of each pair carries the result */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			/* DABS reads the source channel directly; the abs
			 * modifier is applied on the high half below */
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* fp64_switch() remaps the channel for the
				 * double register pairing -- defined elsewhere
				 * in this file; presumably swaps the halves of
				 * each pair (TODO(review): confirm) */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases: source modifiers are applied only
		 * on the high channel of each pair */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_SUB:
				r600_bytecode_src_toggle_neg(&alu.src[1]);
				break;
			case TGSI_OPCODE_DABS:
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* restore the caller's original (unexpanded) writemask */
		write_mask = inst->Dst[0].Register.WriteMask;

		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			/* use_tmp is (staged source channel + 1) */
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2821
2822static int tgsi_op2_64(struct r600_shader_ctx *ctx)
2823{
2824	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2825	unsigned write_mask = inst->Dst[0].Register.WriteMask;
2826	/* confirm writemasking */
2827	if ((write_mask & 0x3) != 0x3 &&
2828	    (write_mask & 0xc) != 0xc) {
2829		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
2830		return -1;
2831	}
2832	return tgsi_op2_64_params(ctx, false, false);
2833}
2834
/* Two-source 64-bit op producing a single double result (singledest=true),
 * sources in their given order (swap=false). */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false);
}
2839
/* Two-source 64-bit op producing a single double result (singledest=true),
 * with the two sources exchanged (swap=true). */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true);
}
2844
2845static int tgsi_op3_64(struct r600_shader_ctx *ctx)
2846{
2847	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2848	struct r600_bytecode_alu alu;
2849	int i, j, r;
2850	int lasti = 3;
2851	int tmp = r600_get_temp(ctx);
2852
2853	for (i = 0; i < lasti + 1; i++) {
2854
2855		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2856		alu.op = ctx->inst_info->op;
2857		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2858			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
2859		}
2860
2861		if (inst->Dst[0].Register.WriteMask & (1 << i))
2862			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2863		else
2864			alu.dst.sel = tmp;
2865
2866		alu.dst.chan = i;
2867		alu.is_op3 = 1;
2868		if (i == lasti) {
2869			alu.last = 1;
2870		}
2871		r = r600_bytecode_add_alu(ctx->bc, &alu);
2872		if (r)
2873			return r;
2874	}
2875	return 0;
2876}
2877
/* Generic two-operand TGSI->ALU translation, shared by
 * tgsi_op2/_swap/_trans.
 *
 * swap:       emit src1 OP src0 instead of src0 OP src1 (for opcodes
 *             whose hardware operand order is reversed).
 * trans_only: the op runs only in the transcendental slot, so every
 *             emitted instruction closes its own ALU group (alu.last on
 *             each iteration); when more than one channel is written the
 *             results are staged in temp_reg and copied out afterwards.
 *
 * Returns 0 on success or a bytecode-emission error code. */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* handle some special cases */
		switch (inst->Instruction.Opcode) {
		case TGSI_OPCODE_SUB:
			/* SUB is the base op with the second operand negated */
			r600_bytecode_src_toggle_neg(&alu.src[1]);
			break;
		case TGSI_OPCODE_ABS:
			/* ABS is the base op with the abs modifier on src0 */
			r600_bytecode_src_set_abs(&alu.src[0]);
			break;
		default:
			break;
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2947
/* Plain two-operand op: natural operand order, vector slots allowed. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
2952
/* Two-operand op with src0/src1 exchanged before emission. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
2957
/* Two-operand op restricted to the transcendental slot (one ALU group
 * per channel; see tgsi_op2_s trans_only handling). */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
2962
2963static int tgsi_ineg(struct r600_shader_ctx *ctx)
2964{
2965	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2966	struct r600_bytecode_alu alu;
2967	int i, r;
2968	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2969
2970	for (i = 0; i < lasti + 1; i++) {
2971
2972		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2973			continue;
2974		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2975		alu.op = ctx->inst_info->op;
2976
2977		alu.src[0].sel = V_SQ_ALU_SRC_0;
2978
2979		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2980
2981		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2982
2983		if (i == lasti) {
2984			alu.last = 1;
2985		}
2986		r = r600_bytecode_add_alu(ctx->bc, &alu);
2987		if (r)
2988			return r;
2989	}
2990	return 0;
2991
2992}
2993
2994static int tgsi_dneg(struct r600_shader_ctx *ctx)
2995{
2996	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2997	struct r600_bytecode_alu alu;
2998	int i, r;
2999	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3000
3001	for (i = 0; i < lasti + 1; i++) {
3002
3003		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3004			continue;
3005		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3006		alu.op = ALU_OP1_MOV;
3007
3008		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3009
3010		if (i == 1 || i == 3)
3011			r600_bytecode_src_toggle_neg(&alu.src[0]);
3012		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3013
3014		if (i == lasti) {
3015			alu.last = 1;
3016		}
3017		r = r600_bytecode_add_alu(ctx->bc, &alu);
3018		if (r)
3019			return r;
3020	}
3021	return 0;
3022
3023}
3024
/* DFRACEXP: split a double into fractional mantissa (Dst[0], a channel
 * pair) and integer exponent (Dst[1]).
 * The op is issued in all four slots into temp_reg; the mantissa pair is
 * then copied from temp channels 2/3 and the exponent from channel 1.
 * NOTE(review): the Dst[1] loop always reads temp chan 1 and stops after
 * the first enabled channel -- confirm this matches the hardware's
 * result layout for all writemasks. */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;
	/* mantissa goes to XY or ZW depending on the mask */
	int firsti = write_mask == 0xc ? 2 : 0;

	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* MOV first two channels to writemask dst0 */
	for (i = 0; i <= 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].chan = i + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}
3085
3086
/* Evergreen/Cayman I2D / U2D: convert 32-bit integers to doubles.
 * Pass 1: inst_info->op (INT->FLT conversion) of each source component
 *         into temp_reg, one float per destination channel pair.
 * Pass 2: FLT32_TO_FLT64 expands each float into a channel pair; the
 *         odd (second) channel of each pair is fed a zero literal. */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
		inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT32_TO_FLT64;

		/* even channels read the converted float, odd ones a 0 literal */
		alu.src[0].chan = i/2;
		if (i%2 == 0)
			alu.src[0].sel = ctx->temp_reg;
		else {
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0x0;
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3133
/* Evergreen/Cayman D2I / D2U: convert doubles to 32-bit integers.
 * Pass 1: FLT64_TO_FLT32 collapses each channel pair into a single
 *         float (only the even channels' results are written).
 * Pass 2: inst_info->op (FLT->INT conversion) on each collapsed float.
 * NOTE(review): pass 2 always writes destination channel 0
 * (tgsi_dst(..., 0, ...)) even while iterating -- looks suspicious for
 * writemasks covering both channel pairs; confirm intended. */
static int egcm_double_to_int(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
		inst->Instruction.Opcode == TGSI_OPCODE_D2U);

	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT64_TO_FLT32;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], fp64_switch(i));
		alu.dst.chan = i;
		alu.dst.sel = ctx->temp_reg;
		/* only the even channel of each pair carries the result */
		alu.dst.write = i%2 == 0;
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].chan = i*2;
		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3175
/* Emit a scalar 64-bit op (DRSQ/DSQRT/DRCP family) on Cayman-style
 * hardware: the op is issued in slots 0-2 with the source's high dword
 * in src0 and low dword in src1; only slots 0/1 (the result's low/high
 * dwords) keep their writes. The pair is then fanned out to every
 * destination channel pair enabled in the writemask. */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = 3;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* these have to write the result to X/Y by the looks of it */
	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		/* should only be one src regs */
		assert (inst->Instruction.NumSrcRegs == 1);

		r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);

		/* RSQ should take the absolute value of src */
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		    ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT) {
			r600_bytecode_src_set_abs(&alu.src[1]);
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = (i == 0 || i == 1);

		if (ctx->bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* replicate the t1.xy result pair into each written channel pair */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3229
3230static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
3231{
3232	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3233	int i, j, r;
3234	struct r600_bytecode_alu alu;
3235	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3236
3237	for (i = 0 ; i < last_slot; i++) {
3238		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3239		alu.op = ctx->inst_info->op;
3240		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3241			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
3242
3243			/* RSQ should take the absolute value of src */
3244			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
3245				r600_bytecode_src_set_abs(&alu.src[j]);
3246			}
3247		}
3248		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3249		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3250
3251		if (i == last_slot - 1)
3252			alu.last = 1;
3253		r = r600_bytecode_add_alu(ctx->bc, &alu);
3254		if (r)
3255			return r;
3256	}
3257	return 0;
3258}
3259
/* Cayman integer multiply (T-slot-only op on older chips): for each
 * written channel k the op is issued in all four vector slots reading
 * component k, but only slot k's result is kept in t1; the results are
 * then moved from t1 to the destination. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			/* only the slot matching the channel keeps its write */
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* copy the gathered results from t1 to the real destination */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3307
3308
/* Cayman 64-bit multiply: for each channel pair (XY, ZW) enabled in the
 * writemask, issue the op across all four slots -- slot 3 reads source
 * channel k*2, the other slots k*2+1 -- collecting the result in t1,
 * then copy t1 to the destination channels. */
static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k < 2; k++) {
		/* skip a pair only if neither of its channels is written */
		if (!(inst->Dst[0].Register.WriteMask & (0x3 << (k * 2))))
			continue;

		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));;
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			alu.dst.write = 1;
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* copy the result pair(s) from t1 to the real destination */
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3356
3357/*
3358 * r600 - trunc to -PI..PI range
3359 * r700 - normalize by dividing by 2PI
3360 * see fdo bug 27901
3361 */
3362static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
3363{
3364	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
3365	static float double_pi = 3.1415926535 * 2;
3366	static float neg_pi = -3.1415926535;
3367
3368	int r;
3369	struct r600_bytecode_alu alu;
3370
3371	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3372	alu.op = ALU_OP3_MULADD;
3373	alu.is_op3 = 1;
3374
3375	alu.dst.chan = 0;
3376	alu.dst.sel = ctx->temp_reg;
3377	alu.dst.write = 1;
3378
3379	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3380
3381	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3382	alu.src[1].chan = 0;
3383	alu.src[1].value = *(uint32_t *)&half_inv_pi;
3384	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
3385	alu.src[2].chan = 0;
3386	alu.last = 1;
3387	r = r600_bytecode_add_alu(ctx->bc, &alu);
3388	if (r)
3389		return r;
3390
3391	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3392	alu.op = ALU_OP1_FRACT;
3393
3394	alu.dst.chan = 0;
3395	alu.dst.sel = ctx->temp_reg;
3396	alu.dst.write = 1;
3397
3398	alu.src[0].sel = ctx->temp_reg;
3399	alu.src[0].chan = 0;
3400	alu.last = 1;
3401	r = r600_bytecode_add_alu(ctx->bc, &alu);
3402	if (r)
3403		return r;
3404
3405	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3406	alu.op = ALU_OP3_MULADD;
3407	alu.is_op3 = 1;
3408
3409	alu.dst.chan = 0;
3410	alu.dst.sel = ctx->temp_reg;
3411	alu.dst.write = 1;
3412
3413	alu.src[0].sel = ctx->temp_reg;
3414	alu.src[0].chan = 0;
3415
3416	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3417	alu.src[1].chan = 0;
3418	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
3419	alu.src[2].chan = 0;
3420
3421	if (ctx->bc->chip_class == R600) {
3422		alu.src[1].value = *(uint32_t *)&double_pi;
3423		alu.src[2].value = *(uint32_t *)&neg_pi;
3424	} else {
3425		alu.src[1].sel = V_SQ_ALU_SRC_1;
3426		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
3427		alu.src[2].neg = 1;
3428	}
3429
3430	alu.last = 1;
3431	r = r600_bytecode_add_alu(ctx->bc, &alu);
3432	if (r)
3433		return r;
3434	return 0;
3435}
3436
/* Cayman SIN/COS: normalize the angle via tgsi_setup_trig, then run
 * the trig op in each needed vector slot, all slots reading temp.x and
 * each writing its own destination channel per the writemask. */
static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	int i, r;

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;


	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3467
/* R600/R700 SIN/COS: normalize the angle via tgsi_setup_trig, evaluate
 * the trig op once into temp_reg.x, then replicate the scalar result to
 * every destination channel enabled in the writemask. */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3510
/* SCS (sine/cosine): dst.x = cos(src.x), dst.y = sin(src.x),
 * dst.z = 0.0, dst.w = 1.0, each component gated by the writemask.
 * The angle is pre-normalized once via tgsi_setup_trig; on Cayman the
 * scalar COS/SIN ops are replicated across three vector slots with
 * only the wanted slot's write enabled. */
static int tgsi_scs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* We'll only need the trig stuff if we are going to write to the
	 * X or Y components of the destination vector.
	 */
	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
		r = tgsi_setup_trig(ctx);
		if (r)
			return r;
	}

	/* dst.x = COS */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_COS;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				/* only slot 0 (the X result) is kept */
				if (i == 0)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_COS;
			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.y = SIN */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_SIN;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				/* only slot 1 (the Y result) is kept */
				if (i == 1)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_SIN;
			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.z = 0.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.w = 1.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3631
/* KILL / KILL_IF: emit a per-channel kill compare (inst_info->op) of
 * the form (0 OP operand). Unconditional KILL compares against -1 so
 * the condition always fires; KILL_IF compares against the source.
 * The kill must terminate the current ALU clause, hence force_add_cf. */
static int tgsi_kill(struct r600_shader_ctx *ctx)
{
	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
			/* unconditional: compare 0 against -1 */
			alu.src[1].sel = V_SQ_ALU_SRC_1;
			alu.src[1].neg = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* kill must be last in ALU */
	ctx->bc->force_add_cf = 1;
	ctx->shader->uses_kill = TRUE;
	return 0;
}
3665
/* LIT: classic fixed-function lighting coefficients.
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0)
 *   dst.z = exp(MUL_LIT(log_clamped(max(src.y, 0)), src.w, src.x))
 *   dst.w = 1.0
 * (MUL_LIT handles the spotlight special cases in hardware.)
 * Cayman lacks the scalar T slot, so the LOG/EXP steps are replicated
 * across three vector slots with only slot 2's result kept. */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* the expensive LOG/MUL_LIT/EXP chain is only needed for dst.z */
	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		int i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log result landed for the next step */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel  = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
3815
/* RSQ: reciprocal square root of |src.x|, computed once into
 * temp_reg.x and then replicated to every written destination channel.
 * Uses the clamped variant (see XXX note below re: non-GL users). */
static int tgsi_rsq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIPSQRT_IEEE instead.
	 */
	alu.op = ALU_OP1_RECIPSQRT_CLAMPED;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
		r600_bytecode_src_set_abs(&alu.src[i]);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}
3843
3844static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
3845{
3846	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3847	struct r600_bytecode_alu alu;
3848	int i, r;
3849
3850	for (i = 0; i < 4; i++) {
3851		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3852		alu.src[0].sel = ctx->temp_reg;
3853		alu.op = ALU_OP1_MOV;
3854		alu.dst.chan = i;
3855		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3856		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3857		if (i == 3)
3858			alu.last = 1;
3859		r = r600_bytecode_add_alu(ctx->bc, &alu);
3860		if (r)
3861			return r;
3862	}
3863	return 0;
3864}
3865
3866static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
3867{
3868	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3869	struct r600_bytecode_alu alu;
3870	int i, r;
3871
3872	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3873	alu.op = ctx->inst_info->op;
3874	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
3875		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
3876	}
3877	alu.dst.sel = ctx->temp_reg;
3878	alu.dst.write = 1;
3879	alu.last = 1;
3880	r = r600_bytecode_add_alu(ctx->bc, &alu);
3881	if (r)
3882		return r;
3883	/* replicate result */
3884	return tgsi_helper_tempx_replicate(ctx);
3885}
3886
/* Cayman POW(a,b) = EXP2(b * LOG2(a)).  On Cayman the scalar LOG op is
 * replicated across slots 0-2 into the temp; the multiply runs once,
 * and the final EXP runs in each needed slot with writes masked by the
 * destination writemask. */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* LOG2(a) replicated into temp.xyz */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3936
/* POW(a,b) = EXP2(b * LOG2(a)), computed in temp_reg.x via three
 * scalar steps, then replicated to every written destination channel. */
static int tgsi_pow(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_LOG_IEEE;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* POW(a,b) = EXP2(b * LOG2(a))*/
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_EXP_IEEE;
	alu.src[0].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return tgsi_helper_tempx_replicate(ctx);
}
3975
/* Emit ALU bytecode for TGSI integer division / remainder.
 *
 * ctx       - shader translation context
 * mod       - 0: produce the quotient (UDIV/IDIV), 1: produce the
 *             remainder (UMOD/IMOD)
 * signed_op - 0: unsigned operands, 1: signed operands (the unsigned
 *             algorithm runs on absolute values and the result's sign
 *             is fixed up at the end; for IMOD the remainder takes the
 *             sign of src0)
 *
 * Returns 0 on success, or the error from r600_bytecode_add_alu().
 *
 * The hardware has no integer divide, so the quotient is built from a
 * fixed-point reciprocal with an explicit rounding-error correction;
 * the step-by-step derivation is in the comment below.  All of this is
 * emitted per enabled write-mask channel.
 */
static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int tmp0 = ctx->temp_reg;
	int tmp1 = r600_get_temp(ctx);
	int tmp2 = r600_get_temp(ctx);
	int tmp3 = r600_get_temp(ctx);
	/* Unsigned path:
	 *
	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
	 *
	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
	 * 2. tmp0.z = lo (tmp0.x * src2)
	 * 3. tmp0.w = -tmp0.z
	 * 4. tmp0.y = hi (tmp0.x * src2)
	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
	 * 7. tmp1.x = tmp0.x - tmp0.w
	 * 8. tmp1.y = tmp0.x + tmp0.w
	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
	 *
	 * 12. tmp0.w = src1 - tmp0.y       = r
	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
	 *
	 * if DIV
	 *
	 *   15. tmp1.z = tmp0.z + 1			= q + 1
	 *   16. tmp1.w = tmp0.z - 1			= q - 1
	 *
	 * else MOD
	 *
	 *   15. tmp1.z = tmp0.w - src2			= r - src2
	 *   16. tmp1.w = tmp0.w + src2			= r + src2
	 *
	 * endif
	 *
	 * 17. tmp1.x = tmp1.x & tmp1.y
	 *
	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
	 *
	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
	 *
	 * NOTE(review): step 20 (divide-by-zero -> MAX_UINT) from the plan
	 * above does not appear to be emitted anywhere below - confirm
	 * whether src2==0 handling is intentionally omitted here.
	 *
	 * Signed path:
	 *
	 * Same as unsigned, using abs values of the operands,
	 * and fixing the sign of the result in the end.
	 */

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		if (signed_op) {

			/* tmp2.x = -src0 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = -src1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.z sign bit is set if src0 and src2 signs are different */
			/* it will be a sign of the quotient */
			if (!mod) {

				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_XOR_INT;

				alu.dst.sel = tmp2;
				alu.dst.chan = 2;
				alu.dst.write = 1;

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* tmp2.x = |src0| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = |src1| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 1;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			/* On CAYMAN the t-slot-only ops are implemented in all
			 * vector slots (see the note at the top of this file),
			 * so RECIP below is replicated across slots and only
			 * the wanted channel's write is enabled.  The uint
			 * reciprocal is built via float: u2f, recip, scale by
			 * 2^32, f2u. */
			/* tmp3.x = u2f(src2) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_UINT_TO_FLT;

			alu.dst.sel = tmp3;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp0.x = recip(tmp3.x) */
			for (j = 0 ; j < 3; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 0);

				alu.src[0].sel = tmp3;
				alu.src[0].chan = 0;

				if (j == 2)
					alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MUL;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0x4f800000; /* 2^32 as float */

			alu.dst.sel = tmp3;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_FLT_TO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = tmp3;
			alu.src[0].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 2. tmp0.z = lo (tmp0.x * src2) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULLO_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 2);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;
				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
				}

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULLO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 3. tmp0.w = -tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 4. tmp0.y = hi (tmp0.x * src2) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 1);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;

				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
				}
				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 3);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 2;

				alu.src[1].sel = tmp0;
				alu.src[1].chan = 0;

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;

			alu.src[1].sel = tmp0;
			alu.src[1].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 7. tmp1.x = tmp0.x - tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 8. tmp1.y = tmp0.x + tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 0;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 2);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;

				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 0;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
				}

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 0;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULLO_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 1);

				if (signed_op) {
					alu.src[0].sel = tmp2;
					alu.src[0].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
				}

				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULLO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.src[1].sel = tmp0;
			alu.src[1].chan = 2;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 12. tmp0.w = src1 - tmp0.y       = r */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 3;
		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (mod) { /* UMOD */

			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else { /* UDIV */

			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 17. tmp1.x = tmp1.x & tmp1.y */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = mod ? 3 : 2;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		/* For signed ops this is still an intermediate value (the sign
		 * fixup below writes the real destination); for unsigned ops
		 * it goes straight to the TGSI destination. */
		if (signed_op) {
			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;
		} else {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		}

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (signed_op) {

			/* fix the sign of the result */

			if (mod) {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* sign of the remainder is the same as the sign of src0 */
				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

			} else {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* fix the quotient sign (same as the sign of src0*src1) */
				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				alu.src[0].sel = tmp2;
				alu.src[0].chan = 2;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
	}
	return 0;
}
4819
/* TGSI UDIV: unsigned integer division, dst = src0 / src1
 * (mod=0: quotient, signed_op=0: unsigned). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}
4824
/* TGSI UMOD: unsigned integer remainder, dst = src0 % src1
 * (mod=1: remainder, signed_op=0: unsigned). */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}
4829
/* TGSI IDIV: signed integer division, dst = src0 / src1
 * (mod=0: quotient, signed_op=1: operate on |operands| then fix sign). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);
}
4834
/* TGSI IMOD: signed integer remainder, dst = src0 % src1; the
 * remainder takes the sign of src0 (see tgsi_divmod's signed fixup). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);
}
4839
4840
4841static int tgsi_f2i(struct r600_shader_ctx *ctx)
4842{
4843	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4844	struct r600_bytecode_alu alu;
4845	int i, r;
4846	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4847	int last_inst = tgsi_last_instruction(write_mask);
4848
4849	for (i = 0; i < 4; i++) {
4850		if (!(write_mask & (1<<i)))
4851			continue;
4852
4853		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4854		alu.op = ALU_OP1_TRUNC;
4855
4856		alu.dst.sel = ctx->temp_reg;
4857		alu.dst.chan = i;
4858		alu.dst.write = 1;
4859
4860		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4861		if (i == last_inst)
4862			alu.last = 1;
4863		r = r600_bytecode_add_alu(ctx->bc, &alu);
4864		if (r)
4865			return r;
4866	}
4867
4868	for (i = 0; i < 4; i++) {
4869		if (!(write_mask & (1<<i)))
4870			continue;
4871
4872		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4873		alu.op = ctx->inst_info->op;
4874
4875		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4876
4877		alu.src[0].sel = ctx->temp_reg;
4878		alu.src[0].chan = i;
4879
4880		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
4881			alu.last = 1;
4882		r = r600_bytecode_add_alu(ctx->bc, &alu);
4883		if (r)
4884			return r;
4885	}
4886
4887	return 0;
4888}
4889
4890static int tgsi_iabs(struct r600_shader_ctx *ctx)
4891{
4892	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4893	struct r600_bytecode_alu alu;
4894	int i, r;
4895	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4896	int last_inst = tgsi_last_instruction(write_mask);
4897
4898	/* tmp = -src */
4899	for (i = 0; i < 4; i++) {
4900		if (!(write_mask & (1<<i)))
4901			continue;
4902
4903		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4904		alu.op = ALU_OP2_SUB_INT;
4905
4906		alu.dst.sel = ctx->temp_reg;
4907		alu.dst.chan = i;
4908		alu.dst.write = 1;
4909
4910		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4911		alu.src[0].sel = V_SQ_ALU_SRC_0;
4912
4913		if (i == last_inst)
4914			alu.last = 1;
4915		r = r600_bytecode_add_alu(ctx->bc, &alu);
4916		if (r)
4917			return r;
4918	}
4919
4920	/* dst = (src >= 0 ? src : tmp) */
4921	for (i = 0; i < 4; i++) {
4922		if (!(write_mask & (1<<i)))
4923			continue;
4924
4925		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4926		alu.op = ALU_OP3_CNDGE_INT;
4927		alu.is_op3 = 1;
4928		alu.dst.write = 1;
4929
4930		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4931
4932		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4933		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4934		alu.src[2].sel = ctx->temp_reg;
4935		alu.src[2].chan = i;
4936
4937		if (i == last_inst)
4938			alu.last = 1;
4939		r = r600_bytecode_add_alu(ctx->bc, &alu);
4940		if (r)
4941			return r;
4942	}
4943	return 0;
4944}
4945
4946static int tgsi_issg(struct r600_shader_ctx *ctx)
4947{
4948	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4949	struct r600_bytecode_alu alu;
4950	int i, r;
4951	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4952	int last_inst = tgsi_last_instruction(write_mask);
4953
4954	/* tmp = (src >= 0 ? src : -1) */
4955	for (i = 0; i < 4; i++) {
4956		if (!(write_mask & (1<<i)))
4957			continue;
4958
4959		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4960		alu.op = ALU_OP3_CNDGE_INT;
4961		alu.is_op3 = 1;
4962
4963		alu.dst.sel = ctx->temp_reg;
4964		alu.dst.chan = i;
4965		alu.dst.write = 1;
4966
4967		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4968		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4969		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
4970
4971		if (i == last_inst)
4972			alu.last = 1;
4973		r = r600_bytecode_add_alu(ctx->bc, &alu);
4974		if (r)
4975			return r;
4976	}
4977
4978	/* dst = (tmp > 0 ? 1 : tmp) */
4979	for (i = 0; i < 4; i++) {
4980		if (!(write_mask & (1<<i)))
4981			continue;
4982
4983		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4984		alu.op = ALU_OP3_CNDGT_INT;
4985		alu.is_op3 = 1;
4986		alu.dst.write = 1;
4987
4988		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4989
4990		alu.src[0].sel = ctx->temp_reg;
4991		alu.src[0].chan = i;
4992
4993		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
4994
4995		alu.src[2].sel = ctx->temp_reg;
4996		alu.src[2].chan = i;
4997
4998		if (i == last_inst)
4999			alu.last = 1;
5000		r = r600_bytecode_add_alu(ctx->bc, &alu);
5001		if (r)
5002			return r;
5003	}
5004	return 0;
5005}
5006
5007
5008
5009static int tgsi_ssg(struct r600_shader_ctx *ctx)
5010{
5011	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5012	struct r600_bytecode_alu alu;
5013	int i, r;
5014
5015	/* tmp = (src > 0 ? 1 : src) */
5016	for (i = 0; i < 4; i++) {
5017		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5018		alu.op = ALU_OP3_CNDGT;
5019		alu.is_op3 = 1;
5020
5021		alu.dst.sel = ctx->temp_reg;
5022		alu.dst.chan = i;
5023
5024		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5025		alu.src[1].sel = V_SQ_ALU_SRC_1;
5026		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
5027
5028		if (i == 3)
5029			alu.last = 1;
5030		r = r600_bytecode_add_alu(ctx->bc, &alu);
5031		if (r)
5032			return r;
5033	}
5034
5035	/* dst = (-tmp > 0 ? -1 : tmp) */
5036	for (i = 0; i < 4; i++) {
5037		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5038		alu.op = ALU_OP3_CNDGT;
5039		alu.is_op3 = 1;
5040		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5041
5042		alu.src[0].sel = ctx->temp_reg;
5043		alu.src[0].chan = i;
5044		alu.src[0].neg = 1;
5045
5046		alu.src[1].sel = V_SQ_ALU_SRC_1;
5047		alu.src[1].neg = 1;
5048
5049		alu.src[2].sel = ctx->temp_reg;
5050		alu.src[2].chan = i;
5051
5052		if (i == 3)
5053			alu.last = 1;
5054		r = r600_bytecode_add_alu(ctx->bc, &alu);
5055		if (r)
5056			return r;
5057	}
5058	return 0;
5059}
5060
5061static int tgsi_bfi(struct r600_shader_ctx *ctx)
5062{
5063	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5064	struct r600_bytecode_alu alu;
5065	int i, r, t1, t2;
5066
5067	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5068	int last_inst = tgsi_last_instruction(write_mask);
5069
5070	t1 = ctx->temp_reg;
5071
5072	for (i = 0; i < 4; i++) {
5073		if (!(write_mask & (1<<i)))
5074			continue;
5075
5076		/* create mask tmp */
5077		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5078		alu.op = ALU_OP2_BFM_INT;
5079		alu.dst.sel = t1;
5080		alu.dst.chan = i;
5081		alu.dst.write = 1;
5082		alu.last = i == last_inst;
5083
5084		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
5085		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5086
5087		r = r600_bytecode_add_alu(ctx->bc, &alu);
5088		if (r)
5089			return r;
5090	}
5091
5092	t2 = r600_get_temp(ctx);
5093
5094	for (i = 0; i < 4; i++) {
5095		if (!(write_mask & (1<<i)))
5096			continue;
5097
5098		/* shift insert left */
5099		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5100		alu.op = ALU_OP2_LSHL_INT;
5101		alu.dst.sel = t2;
5102		alu.dst.chan = i;
5103		alu.dst.write = 1;
5104		alu.last = i == last_inst;
5105
5106		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5107		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5108
5109		r = r600_bytecode_add_alu(ctx->bc, &alu);
5110		if (r)
5111			return r;
5112	}
5113
5114	for (i = 0; i < 4; i++) {
5115		if (!(write_mask & (1<<i)))
5116			continue;
5117
5118		/* actual bitfield insert */
5119		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5120		alu.op = ALU_OP3_BFI_INT;
5121		alu.is_op3 = 1;
5122		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5123		alu.dst.chan = i;
5124		alu.dst.write = 1;
5125		alu.last = i == last_inst;
5126
5127		alu.src[0].sel = t1;
5128		alu.src[0].chan = i;
5129		alu.src[1].sel = t2;
5130		alu.src[1].chan = i;
5131		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
5132
5133		r = r600_bytecode_add_alu(ctx->bc, &alu);
5134		if (r)
5135			return r;
5136	}
5137
5138	return 0;
5139}
5140
5141static int tgsi_msb(struct r600_shader_ctx *ctx)
5142{
5143	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5144	struct r600_bytecode_alu alu;
5145	int i, r, t1, t2;
5146
5147	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5148	int last_inst = tgsi_last_instruction(write_mask);
5149
5150	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
5151		ctx->inst_info->op == ALU_OP1_FFBH_UINT);
5152
5153	t1 = ctx->temp_reg;
5154
5155	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
5156	for (i = 0; i < 4; i++) {
5157		if (!(write_mask & (1<<i)))
5158			continue;
5159
5160		/* t1 = FFBH_INT / FFBH_UINT */
5161		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5162		alu.op = ctx->inst_info->op;
5163		alu.dst.sel = t1;
5164		alu.dst.chan = i;
5165		alu.dst.write = 1;
5166		alu.last = i == last_inst;
5167
5168		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5169
5170		r = r600_bytecode_add_alu(ctx->bc, &alu);
5171		if (r)
5172			return r;
5173	}
5174
5175	t2 = r600_get_temp(ctx);
5176
5177	for (i = 0; i < 4; i++) {
5178		if (!(write_mask & (1<<i)))
5179			continue;
5180
5181		/* t2 = 31 - t1 */
5182		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5183		alu.op = ALU_OP2_SUB_INT;
5184		alu.dst.sel = t2;
5185		alu.dst.chan = i;
5186		alu.dst.write = 1;
5187		alu.last = i == last_inst;
5188
5189		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
5190		alu.src[0].value = 31;
5191		alu.src[1].sel = t1;
5192		alu.src[1].chan = i;
5193
5194		r = r600_bytecode_add_alu(ctx->bc, &alu);
5195		if (r)
5196			return r;
5197	}
5198
5199	for (i = 0; i < 4; i++) {
5200		if (!(write_mask & (1<<i)))
5201			continue;
5202
5203		/* result = t1 >= 0 ? t2 : t1 */
5204		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5205		alu.op = ALU_OP3_CNDGE_INT;
5206		alu.is_op3 = 1;
5207		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5208		alu.dst.chan = i;
5209		alu.dst.write = 1;
5210		alu.last = i == last_inst;
5211
5212		alu.src[0].sel = t1;
5213		alu.src[0].chan = i;
5214		alu.src[1].sel = t2;
5215		alu.src[1].chan = i;
5216		alu.src[2].sel = t1;
5217		alu.src[2].chan = i;
5218
5219		r = r600_bytecode_add_alu(ctx->bc, &alu);
5220		if (r)
5221			return r;
5222	}
5223
5224	return 0;
5225}
5226
/* Translate the TGSI barycentric-interpolation opcodes
 * (INTERP_CENTROID / INTERP_OFFSET / INTERP_SAMPLE) for Evergreen/Cayman.
 * The fragment's i/j barycentrics live in a pre-allocated interpolator
 * GPR; for OFFSET/SAMPLE the i/j pair is first perturbed using screen-space
 * gradients, then the hardware INTERP_ZW/INTERP_XY ops evaluate the input
 * parameter, and finally the result is copied to the destination (INTERP
 * cannot swizzle its destination directly).
 */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	int input;

	/* Only shader inputs can be interpolated. */
	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	input = inst->Src[0].Register.Index;

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
	}

	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	/* Two i/j pairs are packed per GPR: pair 0 in channels xy,
	 * pair 1 in channels zw — hence the /2 and %2 below. */
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			/* For INTERP_SAMPLE the offset is the position of the
			 * requested sample (src1.x selects which sample). */
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* Fetch the screen-space gradients (ddx/ddy) of the i/j pair. */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;	/* mask out unused components */
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp.xy = ij + ddx(ij) * offset.x
		 * (for INTERP_SAMPLE the x offset comes from the loaded
		 * sample position, channel 2). */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.xy += ddy(ij) * offset.y
		 * (sample position y is in channel 3). */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Evaluate the parameter: four INTERP_ZW lanes followed by four
	 * INTERP_XY lanes.  Each lane reads the i/j pair in (j, i) order
	 * (src chan 1 - (i % 2)); only the lanes that actually produce
	 * results write their destination — ZW writes tmp.zw (i = 2, 3)
	 * and XY writes tmp.xy (i = 4, 5). */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			/* use the perturbed i/j computed above */
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		/* src1 selects the input parameter by its LDS position */
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5390
5391
5392static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
5393{
5394	struct r600_bytecode_alu alu;
5395	int i, r;
5396
5397	for (i = 0; i < 4; i++) {
5398		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5399		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
5400			alu.op = ALU_OP0_NOP;
5401			alu.dst.chan = i;
5402		} else {
5403			alu.op = ALU_OP1_MOV;
5404			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5405			alu.src[0].sel = ctx->temp_reg;
5406			alu.src[0].chan = i;
5407		}
5408		if (i == 3) {
5409			alu.last = 1;
5410		}
5411		r = r600_bytecode_add_alu(ctx->bc, &alu);
5412		if (r)
5413			return r;
5414	}
5415	return 0;
5416}
5417
5418static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
5419                                 unsigned temp, int chan,
5420                                 struct r600_bytecode_alu_src *bc_src,
5421                                 const struct r600_shader_src *shader_src)
5422{
5423	struct r600_bytecode_alu alu;
5424	int r;
5425
5426	r600_bytecode_src(bc_src, shader_src, chan);
5427
5428	/* op3 operands don't support abs modifier */
5429	if (bc_src->abs) {
5430		assert(temp!=0);      /* we actually need the extra register, make sure it is allocated. */
5431		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5432		alu.op = ALU_OP1_MOV;
5433		alu.dst.sel = temp;
5434		alu.dst.chan = chan;
5435		alu.dst.write = 1;
5436
5437		alu.src[0] = *bc_src;
5438		alu.last = true; // sufficient?
5439		r = r600_bytecode_add_alu(ctx->bc, &alu);
5440		if (r)
5441			return r;
5442
5443		memset(bc_src, 0, sizeof(*bc_src));
5444		bc_src->sel = temp;
5445		bc_src->chan = chan;
5446	}
5447	return 0;
5448}
5449
5450static int tgsi_op3(struct r600_shader_ctx *ctx)
5451{
5452	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5453	struct r600_bytecode_alu alu;
5454	int i, j, r;
5455	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5456	int temp_regs[4];
5457
5458	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5459		temp_regs[j] = 0;
5460		if (ctx->src[j].abs)
5461			temp_regs[j] = r600_get_temp(ctx);
5462	}
5463	for (i = 0; i < lasti + 1; i++) {
5464		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5465			continue;
5466
5467		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5468		alu.op = ctx->inst_info->op;
5469		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5470			r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
5471			if (r)
5472				return r;
5473		}
5474
5475		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5476		alu.dst.chan = i;
5477		alu.dst.write = 1;
5478		alu.is_op3 = 1;
5479		if (i == lasti) {
5480			alu.last = 1;
5481		}
5482		r = r600_bytecode_add_alu(ctx->bc, &alu);
5483		if (r)
5484			return r;
5485	}
5486	return 0;
5487}
5488
5489static int tgsi_dp(struct r600_shader_ctx *ctx)
5490{
5491	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5492	struct r600_bytecode_alu alu;
5493	int i, j, r;
5494
5495	for (i = 0; i < 4; i++) {
5496		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5497		alu.op = ctx->inst_info->op;
5498		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5499			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5500		}
5501
5502		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5503		alu.dst.chan = i;
5504		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5505		/* handle some special cases */
5506		switch (inst->Instruction.Opcode) {
5507		case TGSI_OPCODE_DP2:
5508			if (i > 1) {
5509				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
5510				alu.src[0].chan = alu.src[1].chan = 0;
5511			}
5512			break;
5513		case TGSI_OPCODE_DP3:
5514			if (i > 2) {
5515				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
5516				alu.src[0].chan = alu.src[1].chan = 0;
5517			}
5518			break;
5519		case TGSI_OPCODE_DPH:
5520			if (i == 3) {
5521				alu.src[0].sel = V_SQ_ALU_SRC_1;
5522				alu.src[0].chan = 0;
5523				alu.src[0].neg = 0;
5524			}
5525			break;
5526		default:
5527			break;
5528		}
5529		if (i == 3) {
5530			alu.last = 1;
5531		}
5532		r = r600_bytecode_add_alu(ctx->bc, &alu);
5533		if (r)
5534			return r;
5535	}
5536	return 0;
5537}
5538
5539static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
5540						    unsigned index)
5541{
5542	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5543	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
5544		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
5545		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
5546		ctx->src[index].neg || ctx->src[index].abs ||
5547		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == TGSI_PROCESSOR_GEOMETRY);
5548}
5549
5550static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
5551					unsigned index)
5552{
5553	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5554	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
5555}
5556
5557static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
5558{
5559	struct r600_bytecode_vtx vtx;
5560	struct r600_bytecode_alu alu;
5561	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5562	int src_gpr, r, i;
5563	int id = tgsi_tex_get_src_gpr(ctx, 1);
5564
5565	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5566	if (src_requires_loading) {
5567		for (i = 0; i < 4; i++) {
5568			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5569			alu.op = ALU_OP1_MOV;
5570			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5571			alu.dst.sel = ctx->temp_reg;
5572			alu.dst.chan = i;
5573			if (i == 3)
5574				alu.last = 1;
5575			alu.dst.write = 1;
5576			r = r600_bytecode_add_alu(ctx->bc, &alu);
5577			if (r)
5578				return r;
5579		}
5580		src_gpr = ctx->temp_reg;
5581	}
5582
5583	memset(&vtx, 0, sizeof(vtx));
5584	vtx.op = FETCH_OP_VFETCH;
5585	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
5586	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
5587	vtx.src_gpr = src_gpr;
5588	vtx.mega_fetch_count = 16;
5589	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
5590	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
5591	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
5592	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
5593	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
5594	vtx.use_const_fields = 1;
5595
5596	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
5597		return r;
5598
5599	if (ctx->bc->chip_class >= EVERGREEN)
5600		return 0;
5601
5602	for (i = 0; i < 4; i++) {
5603		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5604		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5605			continue;
5606
5607		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5608		alu.op = ALU_OP2_AND_INT;
5609
5610		alu.dst.chan = i;
5611		alu.dst.sel = vtx.dst_gpr;
5612		alu.dst.write = 1;
5613
5614		alu.src[0].sel = vtx.dst_gpr;
5615		alu.src[0].chan = i;
5616
5617		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
5618		alu.src[1].sel += (id * 2);
5619		alu.src[1].chan = i % 4;
5620		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5621
5622		if (i == lasti)
5623			alu.last = 1;
5624		r = r600_bytecode_add_alu(ctx->bc, &alu);
5625		if (r)
5626			return r;
5627	}
5628
5629	if (inst->Dst[0].Register.WriteMask & 3) {
5630		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5631		alu.op = ALU_OP2_OR_INT;
5632
5633		alu.dst.chan = 3;
5634		alu.dst.sel = vtx.dst_gpr;
5635		alu.dst.write = 1;
5636
5637		alu.src[0].sel = vtx.dst_gpr;
5638		alu.src[0].chan = 3;
5639
5640		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
5641		alu.src[1].chan = 0;
5642		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5643
5644		alu.last = 1;
5645		r = r600_bytecode_add_alu(ctx->bc, &alu);
5646		if (r)
5647			return r;
5648	}
5649	return 0;
5650}
5651
/* TXQ for buffer textures: the query result is not read from the
 * resource descriptor but from the driver-filled buffer-info constant
 * buffer, so a single MOV of the right constant channel suffices.
 * The constant layout differs between chip generations (see below).
 */
static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int id = tgsi_tex_get_src_gpr(ctx, 1);	/* buffer/resource id from src1 */

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
	if (ctx->bc->chip_class >= EVERGREEN) {
		/* evergreen+: two entries packed per constant,
		 * at channels 0 (even ids) and 2 (odd ids) */
		alu.src[0].sel += (id / 2);
		alu.src[0].chan = (id % 2) * 2;
	} else {
		/* r600: two constants per buffer; the value lives in the
		 * second channel (y) of the second constant */
		alu.src[0].sel += (id * 2) + 1;
		alu.src[0].chan = 1;
	}
	alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
5679
5680static int tgsi_tex(struct r600_shader_ctx *ctx)
5681{
5682	static float one_point_five = 1.5f;
5683	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5684	struct r600_bytecode_tex tex;
5685	struct r600_bytecode_alu alu;
5686	unsigned src_gpr;
5687	int r, i, j;
5688	int opcode;
5689	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
5690				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5691				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
5692				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
5693
5694	bool txf_add_offsets = inst->Texture.NumOffsets &&
5695			     inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5696			     inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
5697
5698	/* Texture fetch instructions can only use gprs as source.
5699	 * Also they cannot negate the source or take the absolute value */
5700	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
5701					      inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
5702                                              tgsi_tex_src_requires_loading(ctx, 0)) ||
5703					     read_compressed_msaa || txf_add_offsets;
5704
5705	boolean src_loaded = FALSE;
5706	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
5707	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
5708	boolean has_txq_cube_array_z = false;
5709	unsigned sampler_index_mode;
5710
5711	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
5712	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5713	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
5714		if (inst->Dst[0].Register.WriteMask & 4) {
5715			ctx->shader->has_txq_cube_array_z_comp = true;
5716			has_txq_cube_array_z = true;
5717		}
5718
5719	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
5720	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5721	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
5722	    inst->Instruction.Opcode == TGSI_OPCODE_TG4)
5723		sampler_src_reg = 2;
5724
5725	/* TGSI moves the sampler to src reg 3 for TXD */
5726	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
5727		sampler_src_reg = 3;
5728
5729	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
5730
5731	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5732
5733	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
5734		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
5735			ctx->shader->uses_tex_buffers = true;
5736			return r600_do_buffer_txq(ctx);
5737		}
5738		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
5739			if (ctx->bc->chip_class < EVERGREEN)
5740				ctx->shader->uses_tex_buffers = true;
5741			return do_vtx_fetch_inst(ctx, src_requires_loading);
5742		}
5743	}
5744
5745	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
5746		int out_chan;
5747		/* Add perspective divide */
5748		if (ctx->bc->chip_class == CAYMAN) {
5749			out_chan = 2;
5750			for (i = 0; i < 3; i++) {
5751				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5752				alu.op = ALU_OP1_RECIP_IEEE;
5753				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5754
5755				alu.dst.sel = ctx->temp_reg;
5756				alu.dst.chan = i;
5757				if (i == 2)
5758					alu.last = 1;
5759				if (out_chan == i)
5760					alu.dst.write = 1;
5761				r = r600_bytecode_add_alu(ctx->bc, &alu);
5762				if (r)
5763					return r;
5764			}
5765
5766		} else {
5767			out_chan = 3;
5768			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5769			alu.op = ALU_OP1_RECIP_IEEE;
5770			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5771
5772			alu.dst.sel = ctx->temp_reg;
5773			alu.dst.chan = out_chan;
5774			alu.last = 1;
5775			alu.dst.write = 1;
5776			r = r600_bytecode_add_alu(ctx->bc, &alu);
5777			if (r)
5778				return r;
5779		}
5780
5781		for (i = 0; i < 3; i++) {
5782			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5783			alu.op = ALU_OP2_MUL;
5784			alu.src[0].sel = ctx->temp_reg;
5785			alu.src[0].chan = out_chan;
5786			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5787			alu.dst.sel = ctx->temp_reg;
5788			alu.dst.chan = i;
5789			alu.dst.write = 1;
5790			r = r600_bytecode_add_alu(ctx->bc, &alu);
5791			if (r)
5792				return r;
5793		}
5794		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5795		alu.op = ALU_OP1_MOV;
5796		alu.src[0].sel = V_SQ_ALU_SRC_1;
5797		alu.src[0].chan = 0;
5798		alu.dst.sel = ctx->temp_reg;
5799		alu.dst.chan = 3;
5800		alu.last = 1;
5801		alu.dst.write = 1;
5802		r = r600_bytecode_add_alu(ctx->bc, &alu);
5803		if (r)
5804			return r;
5805		src_loaded = TRUE;
5806		src_gpr = ctx->temp_reg;
5807	}
5808
5809
5810	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
5811	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5812	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5813	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
5814	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
5815	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
5816
5817		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
5818		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
5819
5820		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
5821		for (i = 0; i < 4; i++) {
5822			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5823			alu.op = ALU_OP2_CUBE;
5824			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
5825			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
5826			alu.dst.sel = ctx->temp_reg;
5827			alu.dst.chan = i;
5828			if (i == 3)
5829				alu.last = 1;
5830			alu.dst.write = 1;
5831			r = r600_bytecode_add_alu(ctx->bc, &alu);
5832			if (r)
5833				return r;
5834		}
5835
5836		/* tmp1.z = RCP_e(|tmp1.z|) */
5837		if (ctx->bc->chip_class == CAYMAN) {
5838			for (i = 0; i < 3; i++) {
5839				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5840				alu.op = ALU_OP1_RECIP_IEEE;
5841				alu.src[0].sel = ctx->temp_reg;
5842				alu.src[0].chan = 2;
5843				alu.src[0].abs = 1;
5844				alu.dst.sel = ctx->temp_reg;
5845				alu.dst.chan = i;
5846				if (i == 2)
5847					alu.dst.write = 1;
5848				if (i == 2)
5849					alu.last = 1;
5850				r = r600_bytecode_add_alu(ctx->bc, &alu);
5851				if (r)
5852					return r;
5853			}
5854		} else {
5855			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5856			alu.op = ALU_OP1_RECIP_IEEE;
5857			alu.src[0].sel = ctx->temp_reg;
5858			alu.src[0].chan = 2;
5859			alu.src[0].abs = 1;
5860			alu.dst.sel = ctx->temp_reg;
5861			alu.dst.chan = 2;
5862			alu.dst.write = 1;
5863			alu.last = 1;
5864			r = r600_bytecode_add_alu(ctx->bc, &alu);
5865			if (r)
5866				return r;
5867		}
5868
5869		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
5870		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
5871		 * muladd has no writemask, have to use another temp
5872		 */
5873		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5874		alu.op = ALU_OP3_MULADD;
5875		alu.is_op3 = 1;
5876
5877		alu.src[0].sel = ctx->temp_reg;
5878		alu.src[0].chan = 0;
5879		alu.src[1].sel = ctx->temp_reg;
5880		alu.src[1].chan = 2;
5881
5882		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5883		alu.src[2].chan = 0;
5884		alu.src[2].value = *(uint32_t *)&one_point_five;
5885
5886		alu.dst.sel = ctx->temp_reg;
5887		alu.dst.chan = 0;
5888		alu.dst.write = 1;
5889
5890		r = r600_bytecode_add_alu(ctx->bc, &alu);
5891		if (r)
5892			return r;
5893
5894		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5895		alu.op = ALU_OP3_MULADD;
5896		alu.is_op3 = 1;
5897
5898		alu.src[0].sel = ctx->temp_reg;
5899		alu.src[0].chan = 1;
5900		alu.src[1].sel = ctx->temp_reg;
5901		alu.src[1].chan = 2;
5902
5903		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5904		alu.src[2].chan = 0;
5905		alu.src[2].value = *(uint32_t *)&one_point_five;
5906
5907		alu.dst.sel = ctx->temp_reg;
5908		alu.dst.chan = 1;
5909		alu.dst.write = 1;
5910
5911		alu.last = 1;
5912		r = r600_bytecode_add_alu(ctx->bc, &alu);
5913		if (r)
5914			return r;
5915		/* write initial compare value into Z component
5916		  - W src 0 for shadow cube
5917		  - X src 1 for shadow cube array */
5918		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5919		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5920			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5921			alu.op = ALU_OP1_MOV;
5922			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
5923				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5924			else
5925				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5926			alu.dst.sel = ctx->temp_reg;
5927			alu.dst.chan = 2;
5928			alu.dst.write = 1;
5929			alu.last = 1;
5930			r = r600_bytecode_add_alu(ctx->bc, &alu);
5931			if (r)
5932				return r;
5933		}
5934
5935		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5936		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5937			if (ctx->bc->chip_class >= EVERGREEN) {
5938				int mytmp = r600_get_temp(ctx);
5939				static const float eight = 8.0f;
5940				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5941				alu.op = ALU_OP1_MOV;
5942				alu.src[0].sel = ctx->temp_reg;
5943				alu.src[0].chan = 3;
5944				alu.dst.sel = mytmp;
5945				alu.dst.chan = 0;
5946				alu.dst.write = 1;
5947				alu.last = 1;
5948				r = r600_bytecode_add_alu(ctx->bc, &alu);
5949				if (r)
5950					return r;
5951
5952				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
5953				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5954				alu.op = ALU_OP3_MULADD;
5955				alu.is_op3 = 1;
5956				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5957				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5958				alu.src[1].chan = 0;
5959				alu.src[1].value = *(uint32_t *)&eight;
5960				alu.src[2].sel = mytmp;
5961				alu.src[2].chan = 0;
5962				alu.dst.sel = ctx->temp_reg;
5963				alu.dst.chan = 3;
5964				alu.dst.write = 1;
5965				alu.last = 1;
5966				r = r600_bytecode_add_alu(ctx->bc, &alu);
5967				if (r)
5968					return r;
5969			} else if (ctx->bc->chip_class < EVERGREEN) {
5970				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5971				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
5972				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5973				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5974				tex.src_gpr = r600_get_temp(ctx);
5975				tex.src_sel_x = 0;
5976				tex.src_sel_y = 0;
5977				tex.src_sel_z = 0;
5978				tex.src_sel_w = 0;
5979				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
5980				tex.coord_type_x = 1;
5981				tex.coord_type_y = 1;
5982				tex.coord_type_z = 1;
5983				tex.coord_type_w = 1;
5984				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5985				alu.op = ALU_OP1_MOV;
5986				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5987				alu.dst.sel = tex.src_gpr;
5988				alu.dst.chan = 0;
5989				alu.last = 1;
5990				alu.dst.write = 1;
5991				r = r600_bytecode_add_alu(ctx->bc, &alu);
5992				if (r)
5993					return r;
5994
5995				r = r600_bytecode_add_tex(ctx->bc, &tex);
5996				if (r)
5997					return r;
5998			}
5999
6000		}
6001
6002		/* for cube forms of lod and bias we need to route things */
6003		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
6004		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
6005		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6006		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
6007			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6008			alu.op = ALU_OP1_MOV;
6009			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6010			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
6011				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
6012			else
6013				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6014			alu.dst.sel = ctx->temp_reg;
6015			alu.dst.chan = 2;
6016			alu.last = 1;
6017			alu.dst.write = 1;
6018			r = r600_bytecode_add_alu(ctx->bc, &alu);
6019			if (r)
6020				return r;
6021		}
6022
6023		src_loaded = TRUE;
6024		src_gpr = ctx->temp_reg;
6025	}
6026
6027	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
6028		int temp_h = 0, temp_v = 0;
6029		int start_val = 0;
6030
6031		/* if we've already loaded the src (i.e. CUBE don't reload it). */
6032		if (src_loaded == TRUE)
6033			start_val = 1;
6034		else
6035			src_loaded = TRUE;
6036		for (i = start_val; i < 3; i++) {
6037			int treg = r600_get_temp(ctx);
6038
6039			if (i == 0)
6040				src_gpr = treg;
6041			else if (i == 1)
6042				temp_h = treg;
6043			else
6044				temp_v = treg;
6045
6046			for (j = 0; j < 4; j++) {
6047				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6048				alu.op = ALU_OP1_MOV;
6049                                r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
6050                                alu.dst.sel = treg;
6051                                alu.dst.chan = j;
6052                                if (j == 3)
6053                                   alu.last = 1;
6054                                alu.dst.write = 1;
6055                                r = r600_bytecode_add_alu(ctx->bc, &alu);
6056                                if (r)
6057                                    return r;
6058			}
6059		}
6060		for (i = 1; i < 3; i++) {
6061			/* set gradients h/v */
6062			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6063			tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
6064				FETCH_OP_SET_GRADIENTS_V;
6065			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6066			tex.sampler_index_mode = sampler_index_mode;
6067			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6068			tex.resource_index_mode = sampler_index_mode;
6069
6070			tex.src_gpr = (i == 1) ? temp_h : temp_v;
6071			tex.src_sel_x = 0;
6072			tex.src_sel_y = 1;
6073			tex.src_sel_z = 2;
6074			tex.src_sel_w = 3;
6075
6076			tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
6077			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
6078			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
6079				tex.coord_type_x = 1;
6080				tex.coord_type_y = 1;
6081				tex.coord_type_z = 1;
6082				tex.coord_type_w = 1;
6083			}
6084			r = r600_bytecode_add_tex(ctx->bc, &tex);
6085			if (r)
6086				return r;
6087		}
6088	}
6089
6090	if (src_requires_loading && !src_loaded) {
6091		for (i = 0; i < 4; i++) {
6092			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6093			alu.op = ALU_OP1_MOV;
6094			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6095			alu.dst.sel = ctx->temp_reg;
6096			alu.dst.chan = i;
6097			if (i == 3)
6098				alu.last = 1;
6099			alu.dst.write = 1;
6100			r = r600_bytecode_add_alu(ctx->bc, &alu);
6101			if (r)
6102				return r;
6103		}
6104		src_loaded = TRUE;
6105		src_gpr = ctx->temp_reg;
6106	}
6107
6108	/* get offset values */
6109	if (inst->Texture.NumOffsets) {
6110		assert(inst->Texture.NumOffsets == 1);
6111
6112		/* The texture offset feature doesn't work with the TXF instruction
6113		 * and must be emulated by adding the offset to the texture coordinates. */
6114		if (txf_add_offsets) {
6115			const struct tgsi_texture_offset *off = inst->TexOffsets;
6116
6117			switch (inst->Texture.Texture) {
6118			case TGSI_TEXTURE_3D:
6119				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6120				alu.op = ALU_OP2_ADD_INT;
6121				alu.src[0].sel = src_gpr;
6122				alu.src[0].chan = 2;
6123				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6124				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
6125				alu.dst.sel = src_gpr;
6126				alu.dst.chan = 2;
6127				alu.dst.write = 1;
6128				alu.last = 1;
6129				r = r600_bytecode_add_alu(ctx->bc, &alu);
6130				if (r)
6131					return r;
6132				/* fall through */
6133
6134			case TGSI_TEXTURE_2D:
6135			case TGSI_TEXTURE_SHADOW2D:
6136			case TGSI_TEXTURE_RECT:
6137			case TGSI_TEXTURE_SHADOWRECT:
6138			case TGSI_TEXTURE_2D_ARRAY:
6139			case TGSI_TEXTURE_SHADOW2D_ARRAY:
6140				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6141				alu.op = ALU_OP2_ADD_INT;
6142				alu.src[0].sel = src_gpr;
6143				alu.src[0].chan = 1;
6144				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6145				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
6146				alu.dst.sel = src_gpr;
6147				alu.dst.chan = 1;
6148				alu.dst.write = 1;
6149				alu.last = 1;
6150				r = r600_bytecode_add_alu(ctx->bc, &alu);
6151				if (r)
6152					return r;
6153				/* fall through */
6154
6155			case TGSI_TEXTURE_1D:
6156			case TGSI_TEXTURE_SHADOW1D:
6157			case TGSI_TEXTURE_1D_ARRAY:
6158			case TGSI_TEXTURE_SHADOW1D_ARRAY:
6159				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6160				alu.op = ALU_OP2_ADD_INT;
6161				alu.src[0].sel = src_gpr;
6162				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6163				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
6164				alu.dst.sel = src_gpr;
6165				alu.dst.write = 1;
6166				alu.last = 1;
6167				r = r600_bytecode_add_alu(ctx->bc, &alu);
6168				if (r)
6169					return r;
6170				break;
6171				/* texture offsets do not apply to other texture targets */
6172			}
6173		} else {
6174			switch (inst->Texture.Texture) {
6175			case TGSI_TEXTURE_3D:
6176				offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
6177				/* fallthrough */
6178			case TGSI_TEXTURE_2D:
6179			case TGSI_TEXTURE_SHADOW2D:
6180			case TGSI_TEXTURE_RECT:
6181			case TGSI_TEXTURE_SHADOWRECT:
6182			case TGSI_TEXTURE_2D_ARRAY:
6183			case TGSI_TEXTURE_SHADOW2D_ARRAY:
6184				offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
6185				/* fallthrough */
6186			case TGSI_TEXTURE_1D:
6187			case TGSI_TEXTURE_SHADOW1D:
6188			case TGSI_TEXTURE_1D_ARRAY:
6189			case TGSI_TEXTURE_SHADOW1D_ARRAY:
6190				offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
6191			}
6192		}
6193	}
6194
6195	/* Obtain the sample index for reading a compressed MSAA color texture.
6196	 * To read the FMASK, we use the ldfptr instruction, which tells us
6197	 * where the samples are stored.
6198	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
6199	 * which is the identity mapping. Each nibble says which physical sample
6200	 * should be fetched to get that sample.
6201	 *
6202	 * Assume src.z contains the sample index. It should be modified like this:
6203	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
6204	 * Then fetch the texel with src.
6205	 */
6206	if (read_compressed_msaa) {
6207		unsigned sample_chan = 3;
6208		unsigned temp = r600_get_temp(ctx);
6209		assert(src_loaded);
6210
6211		/* temp.w = ldfptr() */
6212		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6213		tex.op = FETCH_OP_LD;
6214		tex.inst_mod = 1; /* to indicate this is ldfptr */
6215		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6216		tex.sampler_index_mode = sampler_index_mode;
6217		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6218		tex.resource_index_mode = sampler_index_mode;
6219		tex.src_gpr = src_gpr;
6220		tex.dst_gpr = temp;
6221		tex.dst_sel_x = 7; /* mask out these components */
6222		tex.dst_sel_y = 7;
6223		tex.dst_sel_z = 7;
6224		tex.dst_sel_w = 0; /* store X */
6225		tex.src_sel_x = 0;
6226		tex.src_sel_y = 1;
6227		tex.src_sel_z = 2;
6228		tex.src_sel_w = 3;
6229		tex.offset_x = offset_x;
6230		tex.offset_y = offset_y;
6231		tex.offset_z = offset_z;
6232		r = r600_bytecode_add_tex(ctx->bc, &tex);
6233		if (r)
6234			return r;
6235
6236		/* temp.x = sample_index*4 */
6237		if (ctx->bc->chip_class == CAYMAN) {
6238			for (i = 0 ; i < 4; i++) {
6239				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6240				alu.op = ALU_OP2_MULLO_INT;
6241				alu.src[0].sel = src_gpr;
6242				alu.src[0].chan = sample_chan;
6243				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6244				alu.src[1].value = 4;
6245				alu.dst.sel = temp;
6246				alu.dst.chan = i;
6247				alu.dst.write = i == 0;
6248				if (i == 3)
6249					alu.last = 1;
6250				r = r600_bytecode_add_alu(ctx->bc, &alu);
6251				if (r)
6252					return r;
6253			}
6254		} else {
6255			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6256			alu.op = ALU_OP2_MULLO_INT;
6257			alu.src[0].sel = src_gpr;
6258			alu.src[0].chan = sample_chan;
6259			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6260			alu.src[1].value = 4;
6261			alu.dst.sel = temp;
6262			alu.dst.chan = 0;
6263			alu.dst.write = 1;
6264			alu.last = 1;
6265			r = r600_bytecode_add_alu(ctx->bc, &alu);
6266			if (r)
6267				return r;
6268		}
6269
6270		/* sample_index = temp.w >> temp.x */
6271		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6272		alu.op = ALU_OP2_LSHR_INT;
6273		alu.src[0].sel = temp;
6274		alu.src[0].chan = 3;
6275		alu.src[1].sel = temp;
6276		alu.src[1].chan = 0;
6277		alu.dst.sel = src_gpr;
6278		alu.dst.chan = sample_chan;
6279		alu.dst.write = 1;
6280		alu.last = 1;
6281		r = r600_bytecode_add_alu(ctx->bc, &alu);
6282		if (r)
6283			return r;
6284
6285		/* sample_index & 0xF */
6286		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6287		alu.op = ALU_OP2_AND_INT;
6288		alu.src[0].sel = src_gpr;
6289		alu.src[0].chan = sample_chan;
6290		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6291		alu.src[1].value = 0xF;
6292		alu.dst.sel = src_gpr;
6293		alu.dst.chan = sample_chan;
6294		alu.dst.write = 1;
6295		alu.last = 1;
6296		r = r600_bytecode_add_alu(ctx->bc, &alu);
6297		if (r)
6298			return r;
6299#if 0
6300		/* visualize the FMASK */
6301		for (i = 0; i < 4; i++) {
6302			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6303			alu.op = ALU_OP1_INT_TO_FLT;
6304			alu.src[0].sel = src_gpr;
6305			alu.src[0].chan = sample_chan;
6306			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6307			alu.dst.chan = i;
6308			alu.dst.write = 1;
6309			alu.last = 1;
6310			r = r600_bytecode_add_alu(ctx->bc, &alu);
6311			if (r)
6312				return r;
6313		}
6314		return 0;
6315#endif
6316	}
6317
6318	/* does this shader want a num layers from TXQ for a cube array? */
6319	if (has_txq_cube_array_z) {
6320		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6321
6322		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6323		alu.op = ALU_OP1_MOV;
6324
6325		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6326		if (ctx->bc->chip_class >= EVERGREEN) {
6327			/* channel 1 or 3 of each word */
6328			alu.src[0].sel += (id / 2);
6329			alu.src[0].chan = ((id % 2) * 2) + 1;
6330		} else {
6331			/* r600 we have them at channel 2 of the second dword */
6332			alu.src[0].sel += (id * 2) + 1;
6333			alu.src[0].chan = 2;
6334		}
6335		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6336		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
6337		alu.last = 1;
6338		r = r600_bytecode_add_alu(ctx->bc, &alu);
6339		if (r)
6340			return r;
6341		/* disable writemask from texture instruction */
6342		inst->Dst[0].Register.WriteMask &= ~4;
6343	}
6344
6345	opcode = ctx->inst_info->op;
6346	if (opcode == FETCH_OP_GATHER4 &&
6347		inst->TexOffsets[0].File != TGSI_FILE_NULL &&
6348		inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
6349		opcode = FETCH_OP_GATHER4_O;
6350
6351		/* GATHER4_O/GATHER4_C_O use offset values loaded by
6352		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
6353		   encoded in the instruction are ignored. */
6354		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6355		tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
6356		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6357		tex.sampler_index_mode = sampler_index_mode;
6358		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6359		tex.resource_index_mode = sampler_index_mode;
6360
6361		tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
6362		tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
6363		tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
6364		tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
6365		tex.src_sel_w = 4;
6366
6367		tex.dst_sel_x = 7;
6368		tex.dst_sel_y = 7;
6369		tex.dst_sel_z = 7;
6370		tex.dst_sel_w = 7;
6371
6372		r = r600_bytecode_add_tex(ctx->bc, &tex);
6373		if (r)
6374			return r;
6375	}
6376
6377	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
6378	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
6379	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
6380	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6381	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
6382	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
6383	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6384		switch (opcode) {
6385		case FETCH_OP_SAMPLE:
6386			opcode = FETCH_OP_SAMPLE_C;
6387			break;
6388		case FETCH_OP_SAMPLE_L:
6389			opcode = FETCH_OP_SAMPLE_C_L;
6390			break;
6391		case FETCH_OP_SAMPLE_LB:
6392			opcode = FETCH_OP_SAMPLE_C_LB;
6393			break;
6394		case FETCH_OP_SAMPLE_G:
6395			opcode = FETCH_OP_SAMPLE_C_G;
6396			break;
6397		/* Texture gather variants */
6398		case FETCH_OP_GATHER4:
6399			opcode = FETCH_OP_GATHER4_C;
6400			break;
6401		case FETCH_OP_GATHER4_O:
6402			opcode = FETCH_OP_GATHER4_C_O;
6403			break;
6404		}
6405	}
6406
6407	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6408	tex.op = opcode;
6409
6410	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6411	tex.sampler_index_mode = sampler_index_mode;
6412	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6413	tex.resource_index_mode = sampler_index_mode;
6414	tex.src_gpr = src_gpr;
6415	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6416
6417	if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
6418		inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
6419		tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
6420	}
6421
6422	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
6423		int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
6424		tex.inst_mod = texture_component_select;
6425
6426		if (ctx->bc->chip_class == CAYMAN) {
6427		/* GATHER4 result order is different from TGSI TG4 */
6428			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
6429			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
6430			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
6431			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6432		} else {
6433			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6434			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
6435			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6436			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6437		}
6438	}
6439	else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
6440		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6441		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6442		tex.dst_sel_z = 7;
6443		tex.dst_sel_w = 7;
6444	}
6445	else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
6446		tex.dst_sel_x = 3;
6447		tex.dst_sel_y = 7;
6448		tex.dst_sel_z = 7;
6449		tex.dst_sel_w = 7;
6450	}
6451	else {
6452		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6453		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6454		tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
6455		tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6456	}
6457
6458
6459	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ||
6460	    inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
6461		tex.src_sel_x = 4;
6462		tex.src_sel_y = 4;
6463		tex.src_sel_z = 4;
6464		tex.src_sel_w = 4;
6465	} else if (src_loaded) {
6466		tex.src_sel_x = 0;
6467		tex.src_sel_y = 1;
6468		tex.src_sel_z = 2;
6469		tex.src_sel_w = 3;
6470	} else {
6471		tex.src_sel_x = ctx->src[0].swizzle[0];
6472		tex.src_sel_y = ctx->src[0].swizzle[1];
6473		tex.src_sel_z = ctx->src[0].swizzle[2];
6474		tex.src_sel_w = ctx->src[0].swizzle[3];
6475		tex.src_rel = ctx->src[0].rel;
6476	}
6477
6478	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
6479	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6480	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6481	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6482		tex.src_sel_x = 1;
6483		tex.src_sel_y = 0;
6484		tex.src_sel_z = 3;
6485		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
6486	}
6487
6488	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
6489	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
6490		tex.coord_type_x = 1;
6491		tex.coord_type_y = 1;
6492	}
6493	tex.coord_type_z = 1;
6494	tex.coord_type_w = 1;
6495
6496	tex.offset_x = offset_x;
6497	tex.offset_y = offset_y;
6498	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
6499		(inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
6500		 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
6501		tex.offset_z = 0;
6502	}
6503	else {
6504		tex.offset_z = offset_z;
6505	}
6506
6507	/* Put the depth for comparison in W.
6508	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
6509	 * Some instructions expect the depth in Z. */
6510	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
6511	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
6512	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
6513	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
6514	    opcode != FETCH_OP_SAMPLE_C_L &&
6515	    opcode != FETCH_OP_SAMPLE_C_LB) {
6516		tex.src_sel_w = tex.src_sel_z;
6517	}
6518
6519	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
6520	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
6521		if (opcode == FETCH_OP_SAMPLE_C_L ||
6522		    opcode == FETCH_OP_SAMPLE_C_LB) {
6523			/* the array index is read from Y */
6524			tex.coord_type_y = 0;
6525		} else {
6526			/* the array index is read from Z */
6527			tex.coord_type_z = 0;
6528			tex.src_sel_z = tex.src_sel_y;
6529		}
6530	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
6531		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
6532		   ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6533		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
6534		    (ctx->bc->chip_class >= EVERGREEN)))
6535		/* the array index is read from Z */
6536		tex.coord_type_z = 0;
6537
6538	/* mask unused source components */
6539	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
6540		switch (inst->Texture.Texture) {
6541		case TGSI_TEXTURE_2D:
6542		case TGSI_TEXTURE_RECT:
6543			tex.src_sel_z = 7;
6544			tex.src_sel_w = 7;
6545			break;
6546		case TGSI_TEXTURE_1D_ARRAY:
6547			tex.src_sel_y = 7;
6548			tex.src_sel_w = 7;
6549			break;
6550		case TGSI_TEXTURE_1D:
6551			tex.src_sel_y = 7;
6552			tex.src_sel_z = 7;
6553			tex.src_sel_w = 7;
6554			break;
6555		}
6556	}
6557
6558	r = r600_bytecode_add_tex(ctx->bc, &tex);
6559	if (r)
6560		return r;
6561
6562	/* add shadow ambient support  - gallium doesn't do it yet */
6563	return 0;
6564}
6565
6566static int tgsi_lrp(struct r600_shader_ctx *ctx)
6567{
6568	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6569	struct r600_bytecode_alu alu;
6570	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6571	unsigned i, temp_regs[2];
6572	int r;
6573
6574	/* optimize if it's just an equal balance */
6575	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
6576		for (i = 0; i < lasti + 1; i++) {
6577			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6578				continue;
6579
6580			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6581			alu.op = ALU_OP2_ADD;
6582			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6583			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6584			alu.omod = 3;
6585			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6586			alu.dst.chan = i;
6587			if (i == lasti) {
6588				alu.last = 1;
6589			}
6590			r = r600_bytecode_add_alu(ctx->bc, &alu);
6591			if (r)
6592				return r;
6593		}
6594		return 0;
6595	}
6596
6597	/* 1 - src0 */
6598	for (i = 0; i < lasti + 1; i++) {
6599		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6600			continue;
6601
6602		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6603		alu.op = ALU_OP2_ADD;
6604		alu.src[0].sel = V_SQ_ALU_SRC_1;
6605		alu.src[0].chan = 0;
6606		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6607		r600_bytecode_src_toggle_neg(&alu.src[1]);
6608		alu.dst.sel = ctx->temp_reg;
6609		alu.dst.chan = i;
6610		if (i == lasti) {
6611			alu.last = 1;
6612		}
6613		alu.dst.write = 1;
6614		r = r600_bytecode_add_alu(ctx->bc, &alu);
6615		if (r)
6616			return r;
6617	}
6618
6619	/* (1 - src0) * src2 */
6620	for (i = 0; i < lasti + 1; i++) {
6621		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6622			continue;
6623
6624		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6625		alu.op = ALU_OP2_MUL;
6626		alu.src[0].sel = ctx->temp_reg;
6627		alu.src[0].chan = i;
6628		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6629		alu.dst.sel = ctx->temp_reg;
6630		alu.dst.chan = i;
6631		if (i == lasti) {
6632			alu.last = 1;
6633		}
6634		alu.dst.write = 1;
6635		r = r600_bytecode_add_alu(ctx->bc, &alu);
6636		if (r)
6637			return r;
6638	}
6639
6640	/* src0 * src1 + (1 - src0) * src2 */
6641        if (ctx->src[0].abs)
6642		temp_regs[0] = r600_get_temp(ctx);
6643	else
6644		temp_regs[0] = 0;
6645	if (ctx->src[1].abs)
6646		temp_regs[1] = r600_get_temp(ctx);
6647	else
6648		temp_regs[1] = 0;
6649
6650	for (i = 0; i < lasti + 1; i++) {
6651		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6652			continue;
6653
6654		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6655		alu.op = ALU_OP3_MULADD;
6656		alu.is_op3 = 1;
6657		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
6658		if (r)
6659			return r;
6660		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
6661		if (r)
6662			return r;
6663		alu.src[2].sel = ctx->temp_reg;
6664		alu.src[2].chan = i;
6665
6666		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6667		alu.dst.chan = i;
6668		if (i == lasti) {
6669			alu.last = 1;
6670		}
6671		r = r600_bytecode_add_alu(ctx->bc, &alu);
6672		if (r)
6673			return r;
6674	}
6675	return 0;
6676}
6677
6678static int tgsi_cmp(struct r600_shader_ctx *ctx)
6679{
6680	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6681	struct r600_bytecode_alu alu;
6682	int i, r, j;
6683	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6684	int temp_regs[3];
6685
6686	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6687		temp_regs[j] = 0;
6688		if (ctx->src[j].abs)
6689			temp_regs[j] = r600_get_temp(ctx);
6690	}
6691
6692	for (i = 0; i < lasti + 1; i++) {
6693		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6694			continue;
6695
6696		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6697		alu.op = ALU_OP3_CNDGE;
6698		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
6699		if (r)
6700			return r;
6701		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
6702		if (r)
6703			return r;
6704		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
6705		if (r)
6706			return r;
6707		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6708		alu.dst.chan = i;
6709		alu.dst.write = 1;
6710		alu.is_op3 = 1;
6711		if (i == lasti)
6712			alu.last = 1;
6713		r = r600_bytecode_add_alu(ctx->bc, &alu);
6714		if (r)
6715			return r;
6716	}
6717	return 0;
6718}
6719
6720static int tgsi_ucmp(struct r600_shader_ctx *ctx)
6721{
6722	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6723	struct r600_bytecode_alu alu;
6724	int i, r;
6725	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6726
6727	for (i = 0; i < lasti + 1; i++) {
6728		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6729			continue;
6730
6731		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6732		alu.op = ALU_OP3_CNDE_INT;
6733		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6734		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6735		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6736		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6737		alu.dst.chan = i;
6738		alu.dst.write = 1;
6739		alu.is_op3 = 1;
6740		if (i == lasti)
6741			alu.last = 1;
6742		r = r600_bytecode_add_alu(ctx->bc, &alu);
6743		if (r)
6744			return r;
6745	}
6746	return 0;
6747}
6748
6749static int tgsi_xpd(struct r600_shader_ctx *ctx)
6750{
6751	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6752	static const unsigned int src0_swizzle[] = {2, 0, 1};
6753	static const unsigned int src1_swizzle[] = {1, 2, 0};
6754	struct r600_bytecode_alu alu;
6755	uint32_t use_temp = 0;
6756	int i, r;
6757
6758	if (inst->Dst[0].Register.WriteMask != 0xf)
6759		use_temp = 1;
6760
6761	for (i = 0; i < 4; i++) {
6762		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6763		alu.op = ALU_OP2_MUL;
6764		if (i < 3) {
6765			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
6766			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
6767		} else {
6768			alu.src[0].sel = V_SQ_ALU_SRC_0;
6769			alu.src[0].chan = i;
6770			alu.src[1].sel = V_SQ_ALU_SRC_0;
6771			alu.src[1].chan = i;
6772		}
6773
6774		alu.dst.sel = ctx->temp_reg;
6775		alu.dst.chan = i;
6776		alu.dst.write = 1;
6777
6778		if (i == 3)
6779			alu.last = 1;
6780		r = r600_bytecode_add_alu(ctx->bc, &alu);
6781		if (r)
6782			return r;
6783	}
6784
6785	for (i = 0; i < 4; i++) {
6786		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6787		alu.op = ALU_OP3_MULADD;
6788
6789		if (i < 3) {
6790			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
6791			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
6792		} else {
6793			alu.src[0].sel = V_SQ_ALU_SRC_0;
6794			alu.src[0].chan = i;
6795			alu.src[1].sel = V_SQ_ALU_SRC_0;
6796			alu.src[1].chan = i;
6797		}
6798
6799		alu.src[2].sel = ctx->temp_reg;
6800		alu.src[2].neg = 1;
6801		alu.src[2].chan = i;
6802
6803		if (use_temp)
6804			alu.dst.sel = ctx->temp_reg;
6805		else
6806			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6807		alu.dst.chan = i;
6808		alu.dst.write = 1;
6809		alu.is_op3 = 1;
6810		if (i == 3)
6811			alu.last = 1;
6812		r = r600_bytecode_add_alu(ctx->bc, &alu);
6813		if (r)
6814			return r;
6815	}
6816	if (use_temp)
6817		return tgsi_helper_copy(ctx, inst);
6818	return 0;
6819}
6820
6821static int tgsi_exp(struct r600_shader_ctx *ctx)
6822{
6823	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6824	struct r600_bytecode_alu alu;
6825	int r;
6826	int i;
6827
6828	/* result.x = 2^floor(src); */
6829	if (inst->Dst[0].Register.WriteMask & 1) {
6830		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6831
6832		alu.op = ALU_OP1_FLOOR;
6833		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6834
6835		alu.dst.sel = ctx->temp_reg;
6836		alu.dst.chan = 0;
6837		alu.dst.write = 1;
6838		alu.last = 1;
6839		r = r600_bytecode_add_alu(ctx->bc, &alu);
6840		if (r)
6841			return r;
6842
6843		if (ctx->bc->chip_class == CAYMAN) {
6844			for (i = 0; i < 3; i++) {
6845				alu.op = ALU_OP1_EXP_IEEE;
6846				alu.src[0].sel = ctx->temp_reg;
6847				alu.src[0].chan = 0;
6848
6849				alu.dst.sel = ctx->temp_reg;
6850				alu.dst.chan = i;
6851				alu.dst.write = i == 0;
6852				alu.last = i == 2;
6853				r = r600_bytecode_add_alu(ctx->bc, &alu);
6854				if (r)
6855					return r;
6856			}
6857		} else {
6858			alu.op = ALU_OP1_EXP_IEEE;
6859			alu.src[0].sel = ctx->temp_reg;
6860			alu.src[0].chan = 0;
6861
6862			alu.dst.sel = ctx->temp_reg;
6863			alu.dst.chan = 0;
6864			alu.dst.write = 1;
6865			alu.last = 1;
6866			r = r600_bytecode_add_alu(ctx->bc, &alu);
6867			if (r)
6868				return r;
6869		}
6870	}
6871
6872	/* result.y = tmp - floor(tmp); */
6873	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
6874		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6875
6876		alu.op = ALU_OP1_FRACT;
6877		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6878
6879		alu.dst.sel = ctx->temp_reg;
6880#if 0
6881		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6882		if (r)
6883			return r;
6884#endif
6885		alu.dst.write = 1;
6886		alu.dst.chan = 1;
6887
6888		alu.last = 1;
6889
6890		r = r600_bytecode_add_alu(ctx->bc, &alu);
6891		if (r)
6892			return r;
6893	}
6894
6895	/* result.z = RoughApprox2ToX(tmp);*/
6896	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
6897		if (ctx->bc->chip_class == CAYMAN) {
6898			for (i = 0; i < 3; i++) {
6899				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6900				alu.op = ALU_OP1_EXP_IEEE;
6901				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6902
6903				alu.dst.sel = ctx->temp_reg;
6904				alu.dst.chan = i;
6905				if (i == 2) {
6906					alu.dst.write = 1;
6907					alu.last = 1;
6908				}
6909
6910				r = r600_bytecode_add_alu(ctx->bc, &alu);
6911				if (r)
6912					return r;
6913			}
6914		} else {
6915			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6916			alu.op = ALU_OP1_EXP_IEEE;
6917			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6918
6919			alu.dst.sel = ctx->temp_reg;
6920			alu.dst.write = 1;
6921			alu.dst.chan = 2;
6922
6923			alu.last = 1;
6924
6925			r = r600_bytecode_add_alu(ctx->bc, &alu);
6926			if (r)
6927				return r;
6928		}
6929	}
6930
6931	/* result.w = 1.0;*/
6932	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
6933		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6934
6935		alu.op = ALU_OP1_MOV;
6936		alu.src[0].sel = V_SQ_ALU_SRC_1;
6937		alu.src[0].chan = 0;
6938
6939		alu.dst.sel = ctx->temp_reg;
6940		alu.dst.chan = 3;
6941		alu.dst.write = 1;
6942		alu.last = 1;
6943		r = r600_bytecode_add_alu(ctx->bc, &alu);
6944		if (r)
6945			return r;
6946	}
6947	return tgsi_helper_copy(ctx, inst);
6948}
6949
6950static int tgsi_log(struct r600_shader_ctx *ctx)
6951{
6952	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6953	struct r600_bytecode_alu alu;
6954	int r;
6955	int i;
6956
6957	/* result.x = floor(log2(|src|)); */
6958	if (inst->Dst[0].Register.WriteMask & 1) {
6959		if (ctx->bc->chip_class == CAYMAN) {
6960			for (i = 0; i < 3; i++) {
6961				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6962
6963				alu.op = ALU_OP1_LOG_IEEE;
6964				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6965				r600_bytecode_src_set_abs(&alu.src[0]);
6966
6967				alu.dst.sel = ctx->temp_reg;
6968				alu.dst.chan = i;
6969				if (i == 0)
6970					alu.dst.write = 1;
6971				if (i == 2)
6972					alu.last = 1;
6973				r = r600_bytecode_add_alu(ctx->bc, &alu);
6974				if (r)
6975					return r;
6976			}
6977
6978		} else {
6979			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6980
6981			alu.op = ALU_OP1_LOG_IEEE;
6982			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6983			r600_bytecode_src_set_abs(&alu.src[0]);
6984
6985			alu.dst.sel = ctx->temp_reg;
6986			alu.dst.chan = 0;
6987			alu.dst.write = 1;
6988			alu.last = 1;
6989			r = r600_bytecode_add_alu(ctx->bc, &alu);
6990			if (r)
6991				return r;
6992		}
6993
6994		alu.op = ALU_OP1_FLOOR;
6995		alu.src[0].sel = ctx->temp_reg;
6996		alu.src[0].chan = 0;
6997
6998		alu.dst.sel = ctx->temp_reg;
6999		alu.dst.chan = 0;
7000		alu.dst.write = 1;
7001		alu.last = 1;
7002
7003		r = r600_bytecode_add_alu(ctx->bc, &alu);
7004		if (r)
7005			return r;
7006	}
7007
7008	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
7009	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
7010
7011		if (ctx->bc->chip_class == CAYMAN) {
7012			for (i = 0; i < 3; i++) {
7013				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7014
7015				alu.op = ALU_OP1_LOG_IEEE;
7016				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7017				r600_bytecode_src_set_abs(&alu.src[0]);
7018
7019				alu.dst.sel = ctx->temp_reg;
7020				alu.dst.chan = i;
7021				if (i == 1)
7022					alu.dst.write = 1;
7023				if (i == 2)
7024					alu.last = 1;
7025
7026				r = r600_bytecode_add_alu(ctx->bc, &alu);
7027				if (r)
7028					return r;
7029			}
7030		} else {
7031			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7032
7033			alu.op = ALU_OP1_LOG_IEEE;
7034			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7035			r600_bytecode_src_set_abs(&alu.src[0]);
7036
7037			alu.dst.sel = ctx->temp_reg;
7038			alu.dst.chan = 1;
7039			alu.dst.write = 1;
7040			alu.last = 1;
7041
7042			r = r600_bytecode_add_alu(ctx->bc, &alu);
7043			if (r)
7044				return r;
7045		}
7046
7047		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7048
7049		alu.op = ALU_OP1_FLOOR;
7050		alu.src[0].sel = ctx->temp_reg;
7051		alu.src[0].chan = 1;
7052
7053		alu.dst.sel = ctx->temp_reg;
7054		alu.dst.chan = 1;
7055		alu.dst.write = 1;
7056		alu.last = 1;
7057
7058		r = r600_bytecode_add_alu(ctx->bc, &alu);
7059		if (r)
7060			return r;
7061
7062		if (ctx->bc->chip_class == CAYMAN) {
7063			for (i = 0; i < 3; i++) {
7064				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7065				alu.op = ALU_OP1_EXP_IEEE;
7066				alu.src[0].sel = ctx->temp_reg;
7067				alu.src[0].chan = 1;
7068
7069				alu.dst.sel = ctx->temp_reg;
7070				alu.dst.chan = i;
7071				if (i == 1)
7072					alu.dst.write = 1;
7073				if (i == 2)
7074					alu.last = 1;
7075
7076				r = r600_bytecode_add_alu(ctx->bc, &alu);
7077				if (r)
7078					return r;
7079			}
7080		} else {
7081			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7082			alu.op = ALU_OP1_EXP_IEEE;
7083			alu.src[0].sel = ctx->temp_reg;
7084			alu.src[0].chan = 1;
7085
7086			alu.dst.sel = ctx->temp_reg;
7087			alu.dst.chan = 1;
7088			alu.dst.write = 1;
7089			alu.last = 1;
7090
7091			r = r600_bytecode_add_alu(ctx->bc, &alu);
7092			if (r)
7093				return r;
7094		}
7095
7096		if (ctx->bc->chip_class == CAYMAN) {
7097			for (i = 0; i < 3; i++) {
7098				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7099				alu.op = ALU_OP1_RECIP_IEEE;
7100				alu.src[0].sel = ctx->temp_reg;
7101				alu.src[0].chan = 1;
7102
7103				alu.dst.sel = ctx->temp_reg;
7104				alu.dst.chan = i;
7105				if (i == 1)
7106					alu.dst.write = 1;
7107				if (i == 2)
7108					alu.last = 1;
7109
7110				r = r600_bytecode_add_alu(ctx->bc, &alu);
7111				if (r)
7112					return r;
7113			}
7114		} else {
7115			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7116			alu.op = ALU_OP1_RECIP_IEEE;
7117			alu.src[0].sel = ctx->temp_reg;
7118			alu.src[0].chan = 1;
7119
7120			alu.dst.sel = ctx->temp_reg;
7121			alu.dst.chan = 1;
7122			alu.dst.write = 1;
7123			alu.last = 1;
7124
7125			r = r600_bytecode_add_alu(ctx->bc, &alu);
7126			if (r)
7127				return r;
7128		}
7129
7130		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7131
7132		alu.op = ALU_OP2_MUL;
7133
7134		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7135		r600_bytecode_src_set_abs(&alu.src[0]);
7136
7137		alu.src[1].sel = ctx->temp_reg;
7138		alu.src[1].chan = 1;
7139
7140		alu.dst.sel = ctx->temp_reg;
7141		alu.dst.chan = 1;
7142		alu.dst.write = 1;
7143		alu.last = 1;
7144
7145		r = r600_bytecode_add_alu(ctx->bc, &alu);
7146		if (r)
7147			return r;
7148	}
7149
7150	/* result.z = log2(|src|);*/
7151	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
7152		if (ctx->bc->chip_class == CAYMAN) {
7153			for (i = 0; i < 3; i++) {
7154				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7155
7156				alu.op = ALU_OP1_LOG_IEEE;
7157				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7158				r600_bytecode_src_set_abs(&alu.src[0]);
7159
7160				alu.dst.sel = ctx->temp_reg;
7161				if (i == 2)
7162					alu.dst.write = 1;
7163				alu.dst.chan = i;
7164				if (i == 2)
7165					alu.last = 1;
7166
7167				r = r600_bytecode_add_alu(ctx->bc, &alu);
7168				if (r)
7169					return r;
7170			}
7171		} else {
7172			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7173
7174			alu.op = ALU_OP1_LOG_IEEE;
7175			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7176			r600_bytecode_src_set_abs(&alu.src[0]);
7177
7178			alu.dst.sel = ctx->temp_reg;
7179			alu.dst.write = 1;
7180			alu.dst.chan = 2;
7181			alu.last = 1;
7182
7183			r = r600_bytecode_add_alu(ctx->bc, &alu);
7184			if (r)
7185				return r;
7186		}
7187	}
7188
7189	/* result.w = 1.0; */
7190	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
7191		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7192
7193		alu.op = ALU_OP1_MOV;
7194		alu.src[0].sel = V_SQ_ALU_SRC_1;
7195		alu.src[0].chan = 0;
7196
7197		alu.dst.sel = ctx->temp_reg;
7198		alu.dst.chan = 3;
7199		alu.dst.write = 1;
7200		alu.last = 1;
7201
7202		r = r600_bytecode_add_alu(ctx->bc, &alu);
7203		if (r)
7204			return r;
7205	}
7206
7207	return tgsi_helper_copy(ctx, inst);
7208}
7209
7210static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
7211{
7212	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7213	struct r600_bytecode_alu alu;
7214	int r;
7215	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7216	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
7217
7218	assert(inst->Dst[0].Register.Index < 3);
7219	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7220
7221	switch (inst->Instruction.Opcode) {
7222	case TGSI_OPCODE_ARL:
7223		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
7224		break;
7225	case TGSI_OPCODE_ARR:
7226		alu.op = ALU_OP1_FLT_TO_INT;
7227		break;
7228	case TGSI_OPCODE_UARL:
7229		alu.op = ALU_OP1_MOV;
7230		break;
7231	default:
7232		assert(0);
7233		return -1;
7234	}
7235
7236	for (i = 0; i <= lasti; ++i) {
7237		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7238			continue;
7239		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7240		alu.last = i == lasti;
7241		alu.dst.sel = reg;
7242	        alu.dst.chan = i;
7243		alu.dst.write = 1;
7244		r = r600_bytecode_add_alu(ctx->bc, &alu);
7245		if (r)
7246			return r;
7247	}
7248
7249	if (inst->Dst[0].Register.Index > 0)
7250		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
7251	else
7252		ctx->bc->ar_loaded = 0;
7253
7254	return 0;
7255}
/* r600/r700 flavor of ARL/ARR/UARL: compute the address value into the
 * dedicated ar_reg temp and clear ar_loaded so AR is reloaded (via MOVA)
 * before the next relative access. */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* ARL: FLOOR the float source into ar_reg first ... */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		/* ... then convert the floored value to int, in place. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		/* ARR: convert directly with FLT_TO_INT, no explicit floor. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		/* UARL: source is already an integer, just copy it. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* force a reload of AR before its next use */
	ctx->bc->ar_loaded = 0;
	return 0;
}
7332
7333static int tgsi_opdst(struct r600_shader_ctx *ctx)
7334{
7335	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7336	struct r600_bytecode_alu alu;
7337	int i, r = 0;
7338
7339	for (i = 0; i < 4; i++) {
7340		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7341
7342		alu.op = ALU_OP2_MUL;
7343		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7344
7345		if (i == 0 || i == 3) {
7346			alu.src[0].sel = V_SQ_ALU_SRC_1;
7347		} else {
7348			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7349		}
7350
7351		if (i == 0 || i == 2) {
7352			alu.src[1].sel = V_SQ_ALU_SRC_1;
7353		} else {
7354			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
7355		}
7356		if (i == 3)
7357			alu.last = 1;
7358		r = r600_bytecode_add_alu(ctx->bc, &alu);
7359		if (r)
7360			return r;
7361	}
7362	return 0;
7363}
7364
7365static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
7366{
7367	struct r600_bytecode_alu alu;
7368	int r;
7369
7370	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7371	alu.op = opcode;
7372	alu.execute_mask = 1;
7373	alu.update_pred = 1;
7374
7375	alu.dst.sel = ctx->temp_reg;
7376	alu.dst.write = 1;
7377	alu.dst.chan = 0;
7378
7379	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7380	alu.src[1].sel = V_SQ_ALU_SRC_0;
7381	alu.src[1].chan = 0;
7382
7383	alu.last = 1;
7384
7385	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
7386	if (r)
7387		return r;
7388	return 0;
7389}
7390
/* Emit 'pops' stack pops.  When possible the pop is folded into the
 * previous ALU clause by promoting it to ALU_POP_AFTER / ALU_POP2_AFTER;
 * otherwise a standalone CF_OP_POP is emitted.  (Note: the parameter
 * intentionally shadows the function name.) */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop = how many pops the last CF clause already does:
		 * 0 for plain ALU, 1 for ALU_POP_AFTER; 3 means "cannot fold". */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* more pops than a clause can absorb: emit a real POP */
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
7423
/* Recompute the worst-case hardware branch-stack depth (in STACK_SIZE
 * entries) from the current push/push_wqm/loop counters, applying the
 * per-generation extra-element rules, and record it in stack->max_entries. */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
                                              unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements, entries;

	unsigned entry_size = stack->entry_size;

	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
7487
7488static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
7489{
7490	switch(reason) {
7491	case FC_PUSH_VPM:
7492		--ctx->bc->stack.push;
7493		assert(ctx->bc->stack.push >= 0);
7494		break;
7495	case FC_PUSH_WQM:
7496		--ctx->bc->stack.push_wqm;
7497		assert(ctx->bc->stack.push_wqm >= 0);
7498		break;
7499	case FC_LOOP:
7500		--ctx->bc->stack.loop;
7501		assert(ctx->bc->stack.loop >= 0);
7502		break;
7503	default:
7504		assert(0);
7505		break;
7506	}
7507}
7508
/* Account for entering one flow-control level and update the recorded
 * worst-case stack depth. */
static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
{
	switch (reason) {
	case FC_PUSH_VPM:
		++ctx->bc->stack.push;
		break;
	case FC_PUSH_WQM:
		++ctx->bc->stack.push_wqm;
		/* NOTE(review): no break here, so FC_PUSH_WQM also increments
		 * stack.loop below.  Looks like a missing break -- confirm
		 * whether the fallthrough is intentional before changing it,
		 * since it affects the max-depth computation. */
	case FC_LOOP:
		++ctx->bc->stack.loop;
		break;
	default:
		assert(0);
	}

	callstack_update_max_depth(ctx, reason);
}
7526
7527static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
7528{
7529	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
7530
7531	sp->mid = realloc((void *)sp->mid,
7532						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
7533	sp->mid[sp->num_mid] = ctx->bc->cf_last;
7534	sp->num_mid++;
7535}
7536
7537static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
7538{
7539	ctx->bc->fc_sp++;
7540	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
7541	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
7542}
7543
7544static void fc_poplevel(struct r600_shader_ctx *ctx)
7545{
7546	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
7547	free(sp->mid);
7548	sp->mid = NULL;
7549	sp->num_mid = 0;
7550	sp->start = NULL;
7551	sp->type = 0;
7552	ctx->bc->fc_sp--;
7553}
7554
#if 0
/* NOTE(review): dead code kept for reference only.  It does not even
 * compile as written (stray ')' after the r600_bytecode_add_cfinst calls
 * below) and is excluded from the build by this #if 0.  Consider deleting
 * it outright rather than keeping it here. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
7602
/* Common IF lowering: emit the predicate-set ALU clause (pushing the
 * active mask), then a JUMP whose target is patched later by
 * tgsi_else()/tgsi_endif(), and open an FC_IF frame. */
static int emit_if(struct r600_shader_ctx *ctx, int opcode)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	callstack_push(ctx, FC_PUSH_VPM);
	return 0;
}
7626
/* IF: float condition, branch taken when src != 0.0 */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}
7631
/* UIF: integer condition, branch taken when src != 0 */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}
7636
7637static int tgsi_else(struct r600_shader_ctx *ctx)
7638{
7639	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
7640	ctx->bc->cf_last->pop_count = 1;
7641
7642	fc_set_mid(ctx, ctx->bc->fc_sp);
7643	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
7644	return 0;
7645}
7646
7647static int tgsi_endif(struct r600_shader_ctx *ctx)
7648{
7649	pops(ctx, 1);
7650	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
7651		R600_ERR("if/endif unbalanced in shader\n");
7652		return -1;
7653	}
7654
7655	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
7656		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
7657		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
7658	} else {
7659		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
7660	}
7661	fc_poplevel(ctx);
7662
7663	callstack_pop(ctx, FC_PUSH_VPM);
7664	return 0;
7665}
7666
/* BGNLOOP: open a loop frame anchored at a LOOP_START_DX10 instruction. */
static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
{
	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
	 * limited to 4096 iterations, like the other LOOP_* instructions. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}
7679
7680static int tgsi_endloop(struct r600_shader_ctx *ctx)
7681{
7682	int i;
7683
7684	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
7685
7686	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
7687		R600_ERR("loop/endloop in shader code are not paired.\n");
7688		return -EINVAL;
7689	}
7690
7691	/* fixup loop pointers - from r600isa
7692	   LOOP END points to CF after LOOP START,
7693	   LOOP START point to CF after LOOP END
7694	   BRK/CONT point to LOOP END CF
7695	*/
7696	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
7697
7698	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
7699
7700	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
7701		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
7702	}
7703	/* XXX add LOOPRET support */
7704	fc_poplevel(ctx);
7705	callstack_pop(ctx, FC_LOOP);
7706	return 0;
7707}
7708
/* BREAKC: conditional loop break.  Checks we are inside a loop frame,
 * then emits either an IF + LOOP_BREAK + ENDIF sequence (workaround for
 * an ALU_BREAK active-mask bug on most Evergreen parts) or a predicated
 * ALU_BREAK clause. */
static int tgsi_loop_breakc(struct r600_shader_ctx *ctx)
{
	int r;
	unsigned int fscp;

	/* find the innermost enclosing FC_LOOP frame; index 0 is unused
	 * (fc_pushlevel pre-increments fc_sp), so 0 means "not found" */
	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
			break;
	}
	if (fscp == 0) {
		R600_ERR("BREAKC not inside loop/endloop pair\n");
		return -EINVAL;
	}

	if (ctx->bc->chip_class == EVERGREEN &&
	    ctx->bc->family != CHIP_CYPRESS &&
	    ctx->bc->family != CHIP_JUNIPER) {
		/* HW bug: ALU_BREAK does not save the active mask correctly */
		r = tgsi_uif(ctx);
		if (r)
			return r;

		r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_BREAK);
		if (r)
			return r;
		fc_set_mid(ctx, fscp);

		return tgsi_endif(ctx);
	} else {
		/* predicated break: PRED_SETE_INT in an ALU_BREAK clause */
		r = emit_logic_pred(ctx, ALU_OP2_PRED_SETE_INT, CF_OP_ALU_BREAK);
		if (r)
			return r;
		fc_set_mid(ctx, fscp);
	}

	return 0;
}
7747
7748static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
7749{
7750	unsigned int fscp;
7751
7752	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
7753	{
7754		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
7755			break;
7756	}
7757
7758	if (fscp == 0) {
7759		R600_ERR("Break not inside loop/endloop pair\n");
7760		return -EINVAL;
7761	}
7762
7763	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
7764
7765	fc_set_mid(ctx, fscp);
7766
7767	return 0;
7768}
7769
7770static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
7771{
7772	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7773	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
7774	int r;
7775
7776	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
7777		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
7778
7779	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
7780	if (!r) {
7781		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
7782		return emit_inc_ring_offset(ctx, stream, TRUE);
7783	}
7784	return r;
7785}
7786
/* UMAD: dst = src0 * src1 + src2 (unsigned).  Lowered as MULLO_UINT into
 * temp_reg followed by ADD_INT with src2. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		if (ctx->bc->chip_class == CAYMAN) {
			/* Cayman runs this t-slot op in all four vector slots;
			 * only the slot matching the destination channel writes.
			 * NOTE(review): the k loop also copies src[2] into the
			 * MULLO operands -- presumably ignored for an OP2
			 * instruction, confirm. */
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP2_MULLO_UINT;
				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
				}
				alu.dst.chan = j;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.write = (j == i);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.dst.chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;

			alu.op = ALU_OP2_MULLO_UINT;
			for (j = 0; j < 2; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}

			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}


	/* (src0 * src1) + src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
7858
/* TGSI opcode dispatch table for r600/r700.  Indexed by TGSI_OPCODE_*;
 * each entry pairs the hardware op (ALU/CF/FETCH) with the emit callback.
 * Bare numeric indices correspond to retired/unassigned TGSI opcodes. */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIP_IEEE instead.
	 */
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_loop_breakc},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
8063
8064static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
8065	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
8066	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
8067	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
8068	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
8069	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
8070	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
8071	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
8072	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
8073	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
8074	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
8075	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
8076	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
8077	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
8078	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
8079	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
8080	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
8081	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
8082	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
8083	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
8084	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
8085	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
8086	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
8087	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
8088	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
8089	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
8090	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
8091	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
8092	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
8093	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
8094	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
8095	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
8096	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
8097	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
8098	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
8099	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
8100	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
8101	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
8102	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
8103	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
8104	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
8105	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
8106	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
8107	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
8108	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
8109	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
8110	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
8111	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
8112	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
8113	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
8114	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
8115	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
8116	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
8117	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
8118	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
8119	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
8120	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
8121	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
8122	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
8123	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
8124	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
8125	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
8126	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
8127	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
8128	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
8129	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
8130	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
8131	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
8132	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
8133	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
8134	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
8135	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
8136	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
8137	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
8138	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
8139	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
8140	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
8141	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
8142	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
8143	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
8144	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
8145	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
8146	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
8147	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
8148	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
8149	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
8150	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
8151	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
8152	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
8153	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
8154	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
8155	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
8156	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
8157	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
8158	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
8159	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
8160	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8161	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
8162	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
8163	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
8164	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
8165	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
8166	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
8167	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
8168	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8169	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
8170	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
8171	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
8172	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
8173	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
8174	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
8175	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
8176	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
8177	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
8178	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
8179	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
8180	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
8181	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
8182	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
8183	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
8184	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
8185	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
8186	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
8187	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
8188	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
8189	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
8190	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
8191	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
8192	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
8193	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
8194	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
8195	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
8196	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
8197	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
8198	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
8199	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
8200	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
8201	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
8202	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
8203	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
8204	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
8205	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
8206	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
8207	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
8208	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
8209	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
8210	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
8211	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
8212	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
8213	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
8214	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
8215	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
8216	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
8217	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
8218	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
8219	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
8220	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
8221	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
8222	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
8223	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
8224	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
8225	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
8226	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
8227	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
8228	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
8229	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
8230	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
8231	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
8232	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
8233	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
8234	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
8235	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
8236	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
8237	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
8238	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
8239	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
8240	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
8241	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
8242	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
8243	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
8244	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
8245	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
8246	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
8247	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
8248	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
8249	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
8250	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
8251	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
8252	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
8253	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
8254	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
8255	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
8256	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
8257	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
8258	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
8259	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
8260	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
8261	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
8262	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
8263	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
8264	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
8265	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
8266	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
8267	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
8268	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
8269	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
8270	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
8271	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
8272	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
8273	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
8274	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
8275	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
8276	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
8277	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
8278	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
8279	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
8280	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
8281	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
8282	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
8283	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
8284};
8285
/*
 * Cayman (CM) opcode translation table: maps each TGSI opcode to the
 * hardware opcode used on Cayman (ALU_OP* / FETCH_OP* / CF_OP*) plus the
 * emit callback that lowers the instruction.  The array is indexed by the
 * TGSI opcode value; bare numeric indices ([22], [44], ...) are holes for
 * retired/unassigned TGSI opcodes and are explicitly routed to
 * tgsi_unsupported so a bogus opcode cannot dispatch through a NULL entry.
 */
8286static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
8287	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
8288	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
8289	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	/* Cayman has no scalar t-slot: ops that were t-slot-only on earlier
	 * chips (RCP, RSQ, SQRT, EX2, LG2, trig, integer multiplies, doubles)
	 * use cayman_* emitters that replicate the op across the vector slots
	 * -- see the "CAYMAN notes" comment at the top of this file. */
8290	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
8291	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
8292	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
8293	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
8294	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
8295	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
8296	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
8297	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
8298	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
8299	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
8300	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
8301	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
8302	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
8303	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
8304	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
8305	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
8306	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
8307	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
8308	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
8309	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
8310	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
8311	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
8312	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
8313	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
8314	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
8315	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
8316	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
8317	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
8318	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
8319	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
8320	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
8321	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
8322	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
8323	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, cayman_trig},
8324	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
8325	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
8326	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
8327	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
8328	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
8329	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
8330	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
8331	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
8332	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
8333	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
8334	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
8335	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, cayman_trig},
8336	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
8337	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
8338	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
8339	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
8340	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
8341	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
8342	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
8343	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
8344	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
8345	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
8346	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
8347	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
8348	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
8349	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
8350	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
8351	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
8352	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
8353	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
8354	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
8355	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
8356	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
8357	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
8358	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
8359	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
8360	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
8361	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
8362	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
8363	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
8364	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
8365	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	/* NOTE(review): _FINE derivatives share the coarse gradient fetch ops
	 * here; no distinct fine-gradient hardware op is selected. */
8366	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
8367	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
8368	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
8369	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
8370	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
8371	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2},
8372	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
8373	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
8374	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
8375	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
8376	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
8377	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
8378	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
8379	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
8380	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
8381	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
8382	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8383	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
8384	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
8385	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
8386	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
8387	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
8388	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
8389	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
8390	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8391	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
8392	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
8393	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
8394	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
8395	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
8396	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
8397	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
8398	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
8399	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
8400	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
8401	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
8402	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
8403	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
8404	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
8405	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
8406	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2},
8407	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
8408	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
8409	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
8410	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
8411	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
8412	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
8413	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
8414	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2},
8415	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2},
8416	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
8417	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
8418	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
8419	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
8420	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
8421	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	/* MULLO_INT on Cayman (vs MULLO_UINT in the eg table above); lowered
	 * across all vector slots by cayman_mul_int_instr. */
8422	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
8423	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
8424	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
8425	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
8426	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
8427	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
8428	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
8429	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
8430	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
8431	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	/* DX-style SAMPLE_* opcodes are not wired up on this backend. */
8432	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
8433	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
8434	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
8435	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
8436	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
8437	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
8438	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
8439	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
8440	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
8441	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
8442	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
8443	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
8444	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
8445	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
8446	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
8447	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
8448	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
8449	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
8450	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
8451	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
8452	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
8453	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
8454	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
8455	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
8456	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
8457	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
8458	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
8459	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
8460	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
8461	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
8462	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
8463	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
8464	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
8465	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
8466	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
8467	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
8468	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
8469	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
8470	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
8471	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
8472	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
8473	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
8474	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
8475	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
8476	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
8477	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
8478	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
8479	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
8480	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
8481	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	/* 64-bit (double precision) opcodes below. */
8482	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
8483	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
8484	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
8485	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
8486	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
8487	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
8488	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
8489	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
8490	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
8491	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
8492	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
8493	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
8494	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
8495	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
8496	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
8497	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
8498	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
8499	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
8500	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
8501	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
8502	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
8503	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
8504	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
8505	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
8506};
8507