/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
28
29#include <stdarg.h>
30
31#include "pipe/p_state.h"
32#include "util/u_string.h"
33#include "util/u_memory.h"
34#include "util/u_inlines.h"
35#include "tgsi/tgsi_parse.h"
36#include "tgsi/tgsi_ureg.h"
37#include "tgsi/tgsi_info.h"
38#include "tgsi/tgsi_strings.h"
39#include "tgsi/tgsi_dump.h"
40#include "tgsi/tgsi_scan.h"
41
42#include "freedreno_lowering.h"
43#include "freedreno_util.h"
44
45#include "ir3_compiler.h"
46#include "ir3_shader.h"
47
48#include "instr-a3xx.h"
49#include "ir3.h"
50
/* Transient per-compile state for translating one TGSI shader into ir3.
 * Allocated by the compile entry point, initialized in compile_init(),
 * torn down in compile_free().
 */
struct ir3_compile_context {
	const struct tgsi_token *tokens;   /* TGSI being compiled (possibly lowered copy) */
	bool free_tokens;                  /* true if 'tokens' was allocated by lowering and must be freed */
	struct ir3 *ir;
	struct ir3_shader_variant *so;

	struct ir3_block *block;           /* current (innermost) block */
	struct ir3_instruction *current_instr;

	/* we need to defer updates to block->outputs[] until the end
	 * of an instruction (so we don't see new value until *after*
	 * the src registers are processed)
	 */
	struct {
		struct ir3_instruction *instr, **instrp;
	} output_updates[16];
	unsigned num_output_updates;

	/* are we in a sequence of "atomic" instructions?
	 */
	bool atomic;

	/* For fragment shaders, from the hw perspective the only
	 * actual input is r0.xy position register passed to bary.f.
	 * But TGSI doesn't know that, it still declares things as
	 * IN[] registers.  So we do all the input tracking normally
	 * and fix things up after compile_instructions()
	 *
	 * NOTE that frag_pos is the hardware position (possibly it
	 * is actually an index or tag or some such.. it is *not*
	 * values that can be directly used for gl_FragCoord..)
	 */
	struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];

	struct tgsi_parse_context parser;
	unsigned type;                     /* TGSI_PROCESSOR_* (vertex vs fragment) */

	struct tgsi_shader_info info;

	/* for calculating input/output positions/linkages: */
	unsigned next_inloc;

	/* internal temporaries handed out by get_internal_temp(): */
	unsigned num_internal_temps;
	struct tgsi_src_register internal_temps[6];

	/* idx/slot for last compiler generated immediate */
	unsigned immediate_idx;

	/* stack of branch instructions that mark (potentially nested)
	 * branch if/else/loop/etc
	 */
	struct {
		struct ir3_instruction *instr, *cond;
		bool inv;   /* true iff in else leg of branch */
	} branch[16];
	unsigned int branch_count;

	/* list of kill instructions: */
	struct ir3_instruction *kill[16];
	unsigned int kill_count;

	/* used when dst is same as one of the src, to avoid overwriting a
	 * src element before the remaining scalar instructions that make
	 * up the vector operation
	 */
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
};
119
120
121static void vectorize(struct ir3_compile_context *ctx,
122		struct ir3_instruction *instr, struct tgsi_dst_register *dst,
123		int nsrcs, ...);
124static void create_mov(struct ir3_compile_context *ctx,
125		struct tgsi_dst_register *dst, struct tgsi_src_register *src);
126static type_t get_ftype(struct ir3_compile_context *ctx);
127
/* Initialize the compile context for one shader variant: run the TGSI
 * lowering passes, reset all per-compile state, and set up the TGSI
 * parser.  Returns TGSI_PARSE_OK on success, or a TGSI_PARSE_* error
 * (also returned when relative addressing of temp/in/out files makes
 * the shader uncompilable by this backend).
 */
static unsigned
compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
		const struct tgsi_token *tokens)
{
	unsigned ret;
	struct tgsi_shader_info *info = &ctx->info;
	/* opcodes this backend can't emit directly; lowered to simpler ops: */
	const struct fd_lowering_config lconfig = {
			.color_two_side = so->key.color_two_side,
			.lower_DST  = true,
			.lower_XPD  = true,
			.lower_SCS  = true,
			.lower_LRP  = true,
			.lower_FRC  = true,
			.lower_POW  = true,
			.lower_LIT  = true,
			.lower_EXP  = true,
			.lower_LOG  = true,
			.lower_DP4  = true,
			.lower_DP3  = true,
			.lower_DPH  = true,
			.lower_DP2  = true,
			.lower_DP2A = true,
	};

	/* lowering returns NULL when nothing needed lowering; only a
	 * newly-allocated token stream must be freed later:
	 */
	ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info);
	ctx->free_tokens = !!ctx->tokens;
	if (!ctx->tokens) {
		/* no lowering */
		ctx->tokens = tokens;
	}
	ctx->ir = so->ir;
	ctx->so = so;
	ctx->next_inloc = 8;
	ctx->num_internal_temps = 0;
	ctx->branch_count = 0;
	ctx->kill_count = 0;
	ctx->block = NULL;
	ctx->current_instr = NULL;
	ctx->num_output_updates = 0;
	ctx->atomic = false;
	ctx->frag_pos = NULL;
	ctx->frag_face = NULL;

	memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord));

#define FM(x) (1 << TGSI_FILE_##x)
	/* optimize can't deal with relative addressing: */
	if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
		return TGSI_PARSE_ERROR;

	/* NOTE: if relative addressing is used, we set constlen in
	 * the compiler (to worst-case value) since we don't know in
	 * the assembler what the max addr reg value can be:
	 */
	if (info->indirect_files & FM(CONSTANT))
		so->constlen = 4 * (ctx->info.file_max[TGSI_FILE_CONSTANT] + 1);

	/* Immediates go after constants: */
	so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1;
	ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);

	ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
	if (ret != TGSI_PARSE_OK)
		return ret;

	ctx->type = ctx->parser.FullHeader.Processor.Processor;

	return ret;
}
197
/* Report a fatal compile error: print the formatted message, dump the
 * TGSI being compiled (for context), and assert in debug builds.
 */
static void
compile_error(struct ir3_compile_context *ctx, const char *format, ...)
{
	va_list ap;
	va_start(ap, format);
	_debug_vprintf(format, ap);
	va_end(ap);
	tgsi_dump(ctx->tokens, 0);
	debug_assert(0);
}
208
/* assert-style check which routes failures through compile_error(),
 * so the offending TGSI gets dumped along with the failed condition:
 */
#define compile_assert(ctx, cond) do { \
		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
	} while (0)
212
/* Release per-compile resources: the lowered token stream (only if
 * lowering allocated one, see compile_init()) and the TGSI parser.
 */
static void
compile_free(struct ir3_compile_context *ctx)
{
	if (ctx->free_tokens)
		free((void *)ctx->tokens);
	tgsi_parse_free(&ctx->parser);
}
220
/* Table entry mapping one TGSI opcode to its translation handler plus
 * the native opcode(s) the handler should emit.
 */
struct instr_translater {
	void (*fxn)(const struct instr_translater *t,
			struct ir3_compile_context *ctx,
			struct tgsi_full_instruction *inst);
	unsigned tgsi_opc;  /* TGSI_OPCODE_* this entry handles */
	opc_t opc;
	opc_t hopc;    /* opc to use for half_precision mode, if different */
	unsigned arg;  /* extra handler-specific argument */
};
230
231static void
232instr_finish(struct ir3_compile_context *ctx)
233{
234	unsigned i;
235
236	if (ctx->atomic)
237		return;
238
239	for (i = 0; i < ctx->num_output_updates; i++)
240		*(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr;
241
242	ctx->num_output_updates = 0;
243}
244
/* For "atomic" groups of instructions, for example the four scalar
 * instructions to perform a vec4 operation.  Basically this just
 * blocks out handling of output_updates so the next scalar instruction
 * still sees the result from before the start of the atomic group.
 *
 * NOTE: when used properly, this could probably replace get/put_dst()
 * stuff.
 */
static void
instr_atomic_start(struct ir3_compile_context *ctx)
{
	ctx->atomic = true;
}
258
/* End an atomic group: re-enable deferred-update handling and flush
 * all updates accumulated during the group.
 */
static void
instr_atomic_end(struct ir3_compile_context *ctx)
{
	ctx->atomic = false;
	instr_finish(ctx);
}
265
/* Create a new instruction in the current block.  Flushes the previous
 * instruction's deferred register updates first (instr_finish()), then
 * makes the new instruction 'current_instr'.
 */
static struct ir3_instruction *
instr_create(struct ir3_compile_context *ctx, int category, opc_t opc)
{
	instr_finish(ctx);
	return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc));
}
272
/* Clone an existing instruction (used by vectorize() to repeat a scalar
 * op per component).  Like instr_create(), flushes deferred updates and
 * updates 'current_instr'.
 */
static struct ir3_instruction *
instr_clone(struct ir3_compile_context *ctx, struct ir3_instruction *instr)
{
	instr_finish(ctx);
	return (ctx->current_instr = ir3_instr_clone(instr));
}
279
/* Push a new (possibly nested) block, sizing its temporary/input/output
 * arrays from the TGSI declarations, and make it the current block.
 */
static struct ir3_block *
push_block(struct ir3_compile_context *ctx)
{
	struct ir3_block *block;
	unsigned ntmp, nin, nout;

/* number of scalar components declared in a given TGSI register file: */
#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1))

	/* hmm, give ourselves room to create 4 extra temporaries (vec4):
	 */
	ntmp = SCALAR_REGS(TEMPORARY);
	ntmp += 4 * 4;

	nout = SCALAR_REGS(OUTPUT);
	nin  = SCALAR_REGS(INPUT);

	/* for outermost block, 'inputs' are the actual shader INPUT
	 * register file.  Reads from INPUT registers always go back to
	 * top block.  For nested blocks, 'inputs' is used to track any
	 * TEMPORARY file register from one of the enclosing blocks that
	 * is ready in this block.
	 */
	if (!ctx->block) {
		/* NOTE: fragment shaders actually have two inputs (r0.xy, the
		 * position)
		 */
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			int n = 2;
			if (ctx->info.reads_position)
				n += 4;
			if (ctx->info.uses_frontface)
				n += 4;
			nin = MAX2(n, nin);
			/* reserve extra output slots for kill instructions: */
			nout += ARRAY_SIZE(ctx->kill);
		}
	} else {
		/* nested block: inputs mirror the temporaries of the parent */
		nin = ntmp;
	}

	block = ir3_block_create(ctx->ir, ntmp, nin, nout);

	/* hide the reserved kill slots from the normal output count: */
	if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block)
		block->noutputs -= ARRAY_SIZE(ctx->kill);

	block->parent = ctx->block;
	ctx->block = block;

	return block;
}
329
/* Pop back to the enclosing block.  The outermost block must never be
 * popped, hence the assert that a parent exists.
 */
static void
pop_block(struct ir3_compile_context *ctx)
{
	ctx->block = ctx->block->parent;
	compile_assert(ctx, ctx->block);
}
336
337static struct ir3_instruction *
338create_output(struct ir3_block *block, struct ir3_instruction *instr,
339		unsigned n)
340{
341	struct ir3_instruction *out;
342
343	out = ir3_instr_create(block, -1, OPC_META_OUTPUT);
344	out->inout.block = block;
345	ir3_reg_create(out, n, 0);
346	if (instr)
347		ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr;
348
349	return out;
350}
351
352static struct ir3_instruction *
353create_input(struct ir3_block *block, struct ir3_instruction *instr,
354		unsigned n)
355{
356	struct ir3_instruction *in;
357
358	in = ir3_instr_create(block, -1, OPC_META_INPUT);
359	in->inout.block = block;
360	ir3_reg_create(in, n, 0);
361	if (instr)
362		ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;
363
364	return in;
365}
366
367static struct ir3_instruction *
368block_input(struct ir3_block *block, unsigned n)
369{
370	/* references to INPUT register file always go back up to
371	 * top level:
372	 */
373	if (block->parent)
374		return block_input(block->parent, n);
375	return block->inputs[n];
376}
377
/* return temporary in scope, creating if needed meta-input node
 * to track block inputs
 */
static struct ir3_instruction *
block_temporary(struct ir3_block *block, unsigned n)
{
	/* references to TEMPORARY register file, find the nearest
	 * enclosing block which has already assigned this temporary,
	 * creating meta-input instructions along the way to keep
	 * track of block inputs
	 */
	if (block->parent && !block->temporaries[n]) {
		/* if already have input for this block, reuse: */
		if (!block->inputs[n])
			block->inputs[n] = block_temporary(block->parent, n);

		/* and create new input to return: */
		return create_input(block, block->inputs[n], n);
	}
	/* NOTE: may be NULL if the temp has never been written; the caller
	 * (ssa_src()) substitutes an immediate 0.0 in that case:
	 */
	return block->temporaries[n];
}
399
/* Create a cat1 (mov) instruction loading the float immediate 'val'.
 *
 * NOTE: *don't* use instr_create() here!
 * (instr_create() calls instr_finish(), which would flush deferred
 * register updates in the middle of building another instruction)
 */
static struct ir3_instruction *
create_immed(struct ir3_compile_context *ctx, float val)
{
	struct ir3_instruction *instr;
	instr = ir3_instr_create(ctx->block, 1, 0);
	instr->cat1.src_type = get_ftype(ctx);
	instr->cat1.dst_type = get_ftype(ctx);
	ir3_reg_create(instr, 0, 0);
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val;
	return instr;
}
413
/* Record that 'instr' produces component 'chan' of TGSI dst register
 * 'dst'.  The write into the block's outputs[]/temporaries[]/address
 * slot is deferred until instr_finish() so the current instruction can
 * still read the old value.
 */
static void
ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_dst_register *dst, unsigned chan)
{
	unsigned n = regid(dst->Index, chan);
	unsigned idx = ctx->num_output_updates;

	compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates));

	/* NOTE: defer update of temporaries[idx] or output[idx]
	 * until instr_finish(), so that if the current instruction
	 * reads the same TEMP/OUT[] it gets the old value:
	 *
	 * bleh.. this might be a bit easier to just figure out
	 * in instr_finish().  But at that point we've already
	 * lost information about OUTPUT vs TEMPORARY register
	 * file..
	 */

	switch (dst->File) {
	case TGSI_FILE_OUTPUT:
		compile_assert(ctx, n < ctx->block->noutputs);
		ctx->output_updates[idx].instrp = &ctx->block->outputs[n];
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	case TGSI_FILE_TEMPORARY:
		compile_assert(ctx, n < ctx->block->ntemporaries);
		ctx->output_updates[idx].instrp = &ctx->block->temporaries[n];
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	case TGSI_FILE_ADDRESS:
		/* only a single address register (a0.x): */
		compile_assert(ctx, n < 1);
		ctx->output_updates[idx].instrp = &ctx->block->address;
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	}
}
454
/* Resolve component 'chan' of TGSI src register 'src' to the SSA
 * instruction which produced it, and attach that to ir3 register 'reg'.
 * Reads of never-written registers get an immediate 0.0 instead.
 */
static void
ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg,
		const struct tgsi_src_register *src, unsigned chan)
{
	struct ir3_block *block = ctx->block;
	unsigned n = regid(src->Index, chan);

	switch (src->File) {
	case TGSI_FILE_INPUT:
		reg->flags |= IR3_REG_SSA;
		reg->instr = block_input(ctx->block, n);
		break;
	case TGSI_FILE_OUTPUT:
		/* really this should just happen in case of 'MOV_SAT OUT[n], ..',
		 * for the following clamp instructions:
		 */
		reg->flags |= IR3_REG_SSA;
		reg->instr = block->outputs[n];
		/* we don't have to worry about read from an OUTPUT that was
		 * assigned outside of the current block, because the _SAT
		 * clamp instructions will always be in the same block as
		 * the original instruction which wrote the OUTPUT
		 */
		compile_assert(ctx, reg->instr);
		break;
	case TGSI_FILE_TEMPORARY:
		reg->flags |= IR3_REG_SSA;
		reg->instr = block_temporary(ctx->block, n);
		break;
	}

	if ((reg->flags & IR3_REG_SSA) && !reg->instr) {
		/* this can happen when registers (or components of a TGSI
		 * register) are used as src before they have been assigned
		 * (undefined contents).  To avoid confusing the rest of the
		 * compiler, and to generally keep things peachy, substitute
		 * an instruction that sets the src to 0.0.  Or to keep
		 * things undefined, I could plug in a random number? :-P
		 *
		 * NOTE: *don't* use instr_create() here!
		 */
		reg->instr = create_immed(ctx, 0.0);
	}
}
499
/* Append a dst register to 'instr' for component 'chan' of TGSI dst
 * 'dst'.  For multi-component writes (wrmask != 0x1), meta-fo "fanout"
 * nodes are created so each written component has its own SSA producer.
 * Returns the created ir3 register.
 */
static struct ir3_register *
add_dst_reg_wrmask(struct ir3_compile_context *ctx,
		struct ir3_instruction *instr, const struct tgsi_dst_register *dst,
		unsigned chan, unsigned wrmask)
{
	unsigned flags = 0, num = 0;
	struct ir3_register *reg;

	switch (dst->File) {
	case TGSI_FILE_OUTPUT:
	case TGSI_FILE_TEMPORARY:
		/* uses SSA */
		break;
	case TGSI_FILE_ADDRESS:
		flags |= IR3_REG_ADDR;
		/* uses SSA */
		break;
	default:
		compile_error(ctx, "unsupported dst register file: %s\n",
			tgsi_file_name(dst->File));
		break;
	}

	if (dst->Indirect)
		flags |= IR3_REG_RELATIV;

	reg = ir3_reg_create(instr, regid(num, chan), flags);

	/* NOTE: do not call ssa_dst() if atomic.. vectorize()
	 * itself will call ssa_dst().  This is to filter out
	 * the (initially bogus) .x component dst which is
	 * created (but not necessarily used, ie. if the net
	 * result of the vector operation does not write to
	 * the .x component)
	 */

	reg->wrmask = wrmask;
	if (wrmask == 0x1) {
		/* normal case */
		if (!ctx->atomic)
			ssa_dst(ctx, instr, dst, chan);
	} else if ((dst->File == TGSI_FILE_TEMPORARY) ||
			(dst->File == TGSI_FILE_OUTPUT) ||
			(dst->File == TGSI_FILE_ADDRESS)) {
		unsigned i;

		/* if instruction writes multiple, we need to create
		 * some place-holder collect the registers:
		 */
		for (i = 0; i < 4; i++) {
			if (wrmask & (1 << i)) {
				/* one meta-fo per written component, selecting
				 * component 'i' of the multi-component result:
				 */
				struct ir3_instruction *collect =
						ir3_instr_create(ctx->block, -1, OPC_META_FO);
				collect->fo.off = i;
				/* unused dst reg: */
				ir3_reg_create(collect, 0, 0);
				/* and src reg used to hold original instr */
				ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr;
				if (!ctx->atomic)
					ssa_dst(ctx, collect, dst, chan+i);
			}
		}
	}

	return reg;
}
566
/* Convenience wrapper for the common single-component dst case. */
static struct ir3_register *
add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_dst_register *dst, unsigned chan)
{
	return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1);
}
573
574static struct ir3_register *
575add_src_reg_wrmask(struct ir3_compile_context *ctx,
576		struct ir3_instruction *instr, const struct tgsi_src_register *src,
577		unsigned chan, unsigned wrmask)
578{
579	unsigned flags = 0, num = 0;
580	struct ir3_register *reg;
581	struct ir3_instruction *orig = NULL;
582
583	/* TODO we need to use a mov to temp for const >= 64.. or maybe
584	 * we could use relative addressing..
585	 */
586	compile_assert(ctx, src->Index < 64);
587
588	switch (src->File) {
589	case TGSI_FILE_IMMEDIATE:
590		/* TODO if possible, use actual immediate instead of const.. but
591		 * TGSI has vec4 immediates, we can only embed scalar (of limited
592		 * size, depending on instruction..)
593		 */
594		flags |= IR3_REG_CONST;
595		num = src->Index + ctx->so->first_immediate;
596		break;
597	case TGSI_FILE_CONSTANT:
598		flags |= IR3_REG_CONST;
599		num = src->Index;
600		break;
601	case TGSI_FILE_OUTPUT:
602		/* NOTE: we should only end up w/ OUTPUT file for things like
603		 * clamp()'ing saturated dst instructions
604		 */
605	case TGSI_FILE_INPUT:
606	case TGSI_FILE_TEMPORARY:
607		/* uses SSA */
608		break;
609	default:
610		compile_error(ctx, "unsupported src register file: %s\n",
611			tgsi_file_name(src->File));
612		break;
613	}
614
615	if (src->Absolute)
616		flags |= IR3_REG_ABS;
617	if (src->Negate)
618		flags |= IR3_REG_NEGATE;
619
620	if (src->Indirect) {
621		flags |= IR3_REG_RELATIV;
622
623		/* shouldn't happen, and we can't cope with it below: */
624		compile_assert(ctx, wrmask == 0x1);
625
626		/* wrap in a meta-deref to track both the src and address: */
627		orig = instr;
628
629		instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF);
630		ir3_reg_create(instr, 0, 0);
631		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address;
632	}
633
634	reg = ir3_reg_create(instr, regid(num, chan), flags);
635
636	reg->wrmask = wrmask;
637	if (wrmask == 0x1) {
638		/* normal case */
639		ssa_src(ctx, reg, src, chan);
640	} else if ((src->File == TGSI_FILE_TEMPORARY) ||
641			(src->File == TGSI_FILE_OUTPUT) ||
642			(src->File == TGSI_FILE_INPUT)) {
643		struct ir3_instruction *collect;
644		unsigned i;
645
646		compile_assert(ctx, !src->Indirect);
647
648		/* if instruction reads multiple, we need to create
649		 * some place-holder collect the registers:
650		 */
651		collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
652		ir3_reg_create(collect, 0, 0);   /* unused dst reg */
653
654		for (i = 0; i < 4; i++) {
655			if (wrmask & (1 << i)) {
656				/* and src reg used point to the original instr */
657				ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
658						src, chan + i);
659			} else if (wrmask & ~((i << i) - 1)) {
660				/* if any remaining components, then dummy
661				 * placeholder src reg to fill in the blanks:
662				 */
663				ir3_reg_create(collect, 0, 0);
664			}
665		}
666
667		reg->flags |= IR3_REG_SSA;
668		reg->instr = collect;
669	}
670
671	if (src->Indirect) {
672		reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA);
673		reg->instr = instr;
674	}
675	return reg;
676}
677
/* Convenience wrapper for the common single-component src case. */
static struct ir3_register *
add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_src_register *src, unsigned chan)
{
	return add_src_reg_wrmask(ctx, instr, src, chan, 0x1);
}
684
/* Initialize a src register which reads back the register addressed by
 * 'dst', with no modifiers and an identity swizzle.
 */
static void
src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
{
	src->File      = dst->File;
	src->Indirect  = dst->Indirect;
	src->Dimension = dst->Dimension;
	src->Index     = dst->Index;
	src->Absolute  = 0;
	src->Negate    = 0;
	src->SwizzleX  = TGSI_SWIZZLE_X;
	src->SwizzleY  = TGSI_SWIZZLE_Y;
	src->SwizzleZ  = TGSI_SWIZZLE_Z;
	src->SwizzleW  = TGSI_SWIZZLE_W;
}
699
/* Get internal-temp src/dst to use for a sequence of instructions
 * generated by a single TGSI op.  The temp index is allocated past the
 * shader's own TEMPORARY declarations; 'tmp_dst' is filled in and a
 * matching src register (identity swizzle) is returned.
 */
static struct tgsi_src_register *
get_internal_temp(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *tmp_dst)
{
	struct tgsi_src_register *tmp_src;
	int n;

	tmp_dst->File      = TGSI_FILE_TEMPORARY;
	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
	tmp_dst->Indirect  = 0;
	tmp_dst->Dimension = 0;

	/* assign next temporary: */
	n = ctx->num_internal_temps++;
	compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
	tmp_src = &ctx->internal_temps[n];

	/* index past the shader's declared temporaries: */
	tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;

	src_from_dst(tmp_src, tmp_dst);

	return tmp_src;
}
726
727static inline bool
728is_const(struct tgsi_src_register *src)
729{
730	return (src->File == TGSI_FILE_CONSTANT) ||
731			(src->File == TGSI_FILE_IMMEDIATE);
732}
733
/* true if the src uses relative (indirect) addressing: */
static inline bool
is_relative(struct tgsi_src_register *src)
{
	return src->Indirect;
}
739
740static inline bool
741is_rel_or_const(struct tgsi_src_register *src)
742{
743	return is_relative(src) || is_const(src);
744}
745
/* float type used for ALU/mov instructions (always full 32-bit for now;
 * ctx unused but kept for symmetry / future half-precision support):
 */
static type_t
get_ftype(struct ir3_compile_context *ctx)
{
	return TYPE_F32;
}
751
/* unsigned integer type counterpart of get_ftype(): */
static type_t
get_utype(struct ir3_compile_context *ctx)
{
	return TYPE_U32;
}
757
758static unsigned
759src_swiz(struct tgsi_src_register *src, int chan)
760{
761	switch (chan) {
762	case 0: return src->SwizzleX;
763	case 1: return src->SwizzleY;
764	case 2: return src->SwizzleZ;
765	case 3: return src->SwizzleW;
766	}
767	assert(0);
768	return 0;
769}
770
/* for instructions that cannot take a const register as src, if needed
 * generate a move to temporary gpr:
 */
static struct tgsi_src_register *
get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
{
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;

	/* only meaningful for const/immediate/indirect srcs: */
	compile_assert(ctx, is_rel_or_const(src));

	tmp_src = get_internal_temp(ctx, &tmp_dst);

	/* copy the value into the temp, then hand back the temp src: */
	create_mov(ctx, &tmp_dst, src);

	return tmp_src;
}
788
/* Fill in 'reg' as a src referencing the compiler-generated immediate
 * 'val', reusing an existing slot in the variant's immediates table
 * when possible (including as the negation of an existing value),
 * otherwise appending a new slot.
 */
static void
get_immediate(struct ir3_compile_context *ctx,
		struct tgsi_src_register *reg, uint32_t val)
{
	unsigned neg, swiz, idx, i;
	/* actually maps 1:1 currently.. not sure if that is safe to rely on: */
	static const unsigned swiz2tgsi[] = {
			TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
	};

	/* search existing immediates for val or -val: */
	for (i = 0; i < ctx->immediate_idx; i++) {
		swiz = i % 4;
		idx  = i / 4;

		if (ctx->so->immediates[idx].val[swiz] == val) {
			neg = 0;
			break;
		}

		/* NOTE(review): this is integer negation of the raw bit
		 * pattern; for float-typed immediates that is not the same
		 * as flipping the sign bit -- confirm the intended match:
		 */
		if (ctx->so->immediates[idx].val[swiz] == -val) {
			neg = 1;
			break;
		}
	}

	if (i == ctx->immediate_idx) {
		/* need to generate a new immediate: */
		swiz = i % 4;
		idx  = i / 4;
		neg  = 0;
		ctx->so->immediates[idx].val[swiz] = val;
		ctx->so->immediates_count = idx + 1;
		ctx->immediate_idx++;
	}

	/* broadcast the selected component via the swizzle: */
	reg->File      = TGSI_FILE_IMMEDIATE;
	reg->Indirect  = 0;
	reg->Dimension = 0;
	reg->Index     = idx;
	reg->Absolute  = 0;
	reg->Negate    = neg;
	reg->SwizzleX  = swiz2tgsi[swiz];
	reg->SwizzleY  = swiz2tgsi[swiz];
	reg->SwizzleZ  = swiz2tgsi[swiz];
	reg->SwizzleW  = swiz2tgsi[swiz];
}
835
/* Emit per-component moves from 'src' into the components selected by
 * dst->WriteMask, honoring the src swizzle.  abs/neg modifiers can't be
 * encoded on a plain mov, so absneg.f is used in that case.
 */
static void
create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
		struct tgsi_src_register *src)
{
	type_t type_mov = get_ftype(ctx);
	unsigned i;

	for (i = 0; i < 4; i++) {
		/* move to destination: */
		if (dst->WriteMask & (1 << i)) {
			struct ir3_instruction *instr;

			if (src->Absolute || src->Negate) {
				/* can't have abs or neg on a mov instr, so use
				 * absneg.f instead to handle these cases:
				 */
				instr = instr_create(ctx, 2, OPC_ABSNEG_F);
			} else {
				instr = instr_create(ctx, 1, 0);
				instr->cat1.src_type = type_mov;
				instr->cat1.dst_type = type_mov;
			}

			add_dst_reg(ctx, instr, dst, i);
			add_src_reg(ctx, instr, src, src_swiz(src, i));
		}
	}
}
864
/* Emit a clamp sequence: dst = max(val, minval) followed by a min
 * against maxval.
 *
 * NOTE(review): the second instruction reads 'val' again rather than
 * 'dst'; this gives min(max(val,min),max) only when 'val' aliases
 * 'dst' (as in create_clamp_imm(), via src_from_dst()) -- confirm the
 * non-aliasing callers' expectations.
 */
static void
create_clamp(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst, struct tgsi_src_register *val,
		struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
{
	struct ir3_instruction *instr;

	instr = instr_create(ctx, 2, OPC_MAX_F);
	vectorize(ctx, instr, dst, 2, val, 0, minval, 0);

	instr = instr_create(ctx, 2, OPC_MIN_F);
	vectorize(ctx, instr, dst, 2, val, 0, maxval, 0);
}
878
/* Clamp the value already in 'dst' between the immediates 'minval' and
 * 'maxval' (raw 32-bit values for the immediates table).
 */
static void
create_clamp_imm(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst,
		uint32_t minval, uint32_t maxval)
{
	struct tgsi_src_register minconst, maxconst;
	struct tgsi_src_register src;

	/* read back what was just written to dst: */
	src_from_dst(&src, dst);

	get_immediate(ctx, &minconst, minval);
	get_immediate(ctx, &maxconst, maxval);

	create_clamp(ctx, dst, &src, &minconst, &maxconst);
}
894
/* Return the dst register to emit into for 'inst'.  If the dst overlaps
 * one of the srcs (and isn't a plain full-width identity-swizzle read,
 * which is safe), redirect the write to an internal temp so the srcs
 * aren't clobbered mid-operation; put_dst() copies the temp back.
 */
static struct tgsi_dst_register *
get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	unsigned i;
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		struct tgsi_src_register *src = &inst->Src[i].Register;
		if ((src->File == dst->File) && (src->Index == dst->Index)) {
			/* full write + identity swizzle can't self-clobber: */
			if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
					(src->SwizzleX == TGSI_SWIZZLE_X) &&
					(src->SwizzleY == TGSI_SWIZZLE_Y) &&
					(src->SwizzleZ == TGSI_SWIZZLE_Z) &&
					(src->SwizzleW == TGSI_SWIZZLE_W))
				continue;
			ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
			ctx->tmp_dst.WriteMask = dst->WriteMask;
			dst = &ctx->tmp_dst;
			break;
		}
	}
	return dst;
}
917
/* Counterpart of get_dst(): if the write was redirected to an internal
 * temp, move the result into the instruction's real dst register.
 */
static void
put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
		struct tgsi_dst_register *dst)
{
	/* if necessary, add mov back into original dst: */
	if (dst != &inst->Dst[0].Register) {
		create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src);
	}
}
927
/* helper to generate the necessary repeat and/or additional instructions
 * to turn a scalar instruction into a vector operation:
 *
 * 'instr' is emitted with a .x dst and the srcs' .x swizzle, then cloned
 * once per additional component in dst->WriteMask, fixing up the dst and
 * src components of each copy.  The whole sequence is wrapped in an
 * atomic group so all components read pre-instruction register state.
 *
 * varargs: 'nsrcs' pairs of (struct tgsi_src_register *, unsigned flags);
 * with IR3_REG_IMMED in flags the pointer slot instead carries an
 * integer immediate value.
 */
static void
vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		struct tgsi_dst_register *dst, int nsrcs, ...)
{
	va_list ap;
	int i, j, n = 0;

	instr_atomic_start(ctx);

	add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X);

	va_start(ap, nsrcs);
	for (j = 0; j < nsrcs; j++) {
		struct tgsi_src_register *src =
				va_arg(ap, struct tgsi_src_register *);
		unsigned flags = va_arg(ap, unsigned);
		struct ir3_register *reg;
		if (flags & IR3_REG_IMMED) {
			reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
			/* this is an ugly cast.. should have put flags first! */
			reg->iim_val = *(int *)&src;
		} else {
			reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X);
		}
		/* XOR rather than OR for negate, so a caller-requested negate
		 * combines with (rather than duplicates) the src's own Negate:
		 */
		reg->flags |= flags & ~IR3_REG_NEGATE;
		if (flags & IR3_REG_NEGATE)
			reg->flags ^= IR3_REG_NEGATE;
	}
	va_end(ap);

	for (i = 0; i < 4; i++) {
		if (dst->WriteMask & (1 << i)) {
			struct ir3_instruction *cur;

			/* first written component reuses 'instr', the rest clone it: */
			if (n++ == 0) {
				cur = instr;
			} else {
				cur = instr_clone(ctx, instr);
			}

			ssa_dst(ctx, cur, dst, i);

			/* fix-up dst register component: */
			cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i);

			/* fix-up src register component: */
			va_start(ap, nsrcs);
			for (j = 0; j < nsrcs; j++) {
				struct ir3_register *reg = cur->regs[j+1];
				struct tgsi_src_register *src =
						va_arg(ap, struct tgsi_src_register *);
				unsigned flags = va_arg(ap, unsigned);
				if (reg->flags & IR3_REG_SSA) {
					ssa_src(ctx, reg, src, src_swiz(src, i));
				} else if (!(flags & IR3_REG_IMMED)) {
					reg->num = regid(reg->num >> 2, src_swiz(src, i));
				}
			}
			va_end(ap);
		}
	}

	instr_atomic_end(ctx);
}
995
996/*
997 * Handlers for TGSI instructions which do not have a 1:1 mapping to
998 * native instructions:
999 */
1000
1001static void
1002trans_clamp(const struct instr_translater *t,
1003		struct ir3_compile_context *ctx,
1004		struct tgsi_full_instruction *inst)
1005{
1006	struct tgsi_dst_register *dst = get_dst(ctx, inst);
1007	struct tgsi_src_register *src0 = &inst->Src[0].Register;
1008	struct tgsi_src_register *src1 = &inst->Src[1].Register;
1009	struct tgsi_src_register *src2 = &inst->Src[2].Register;
1010
1011	create_clamp(ctx, dst, src0, src1, src2);
1012
1013	put_dst(ctx, inst, dst);
1014}
1015
/* ARL(x) = x, but mova from hrN.x to a0.. */
static void
trans_arl(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	unsigned chan = src->SwizzleX;

	compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);

	/* NOTE: we allocate a temporary from a flat register
	 * namespace (ignoring half vs full).  It turns out
	 * not to really matter since registers get reassigned
	 * later in ir3_ra which (hopefully!) can deal a bit
	 * better with mixed half and full precision.
	 */
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	/* cov.f{32,16}s16 Rtmp, Rsrc  -- convert float to s16 */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = get_ftype(ctx);
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, src, chan);

	/* shl.b Rtmp, Rtmp, 2  -- scale to vec4 (scalar-reg) units */
	instr = instr_create(ctx, 2, OPC_SHL_B);
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;

	/* mova a0, Rtmp */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = TYPE_S16;
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
}
1059
1060/*
1061 * texture fetch/sample instructions:
1062 */
1063
/* Describes how TGSI texture coords map onto the consecutive src
 * components of a cat5 sample instruction (see get_tex_info()):
 */
struct tex_info {
	/* order[j] = source component feeding coord slot j, -1 = slot
	 * unused.  Offsets are relative to the first component so
	 * check_swiz() can test for an already-consecutive swizzle.
	 */
	int8_t order[4];
	/* src_wrmask: components of the coord src actually read;
	 * flags: IR3_INSTR_{S,P,3D} bits OR'd into the sam instr.
	 */
	unsigned src_wrmask, flags;
};
1068
/* Select coord layout and instruction flags for a sample instruction,
 * based on opcode (TEX/TXB vs TXP) and texture target.  Shadow targets
 * add IR3_INSTR_S (comparator packed into the coord), TXP adds
 * IR3_INSTR_P (projector in .w), and 3D/cube targets add IR3_INSTR_3D.
 */
static const struct tex_info *
get_tex_info(struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	static const struct tex_info tex1d = {
		.order = { 0, -1, -1, -1 },  /* coord.x */
		.src_wrmask = TGSI_WRITEMASK_XY,
		.flags = 0,
	};
	static const struct tex_info tex1ds = {
		.order = { 0, -1,  2, -1 },  /* coord.xz */
		.src_wrmask = TGSI_WRITEMASK_XYZ,
		.flags = IR3_INSTR_S,
	};
	static const struct tex_info tex2d = {
		.order = { 0,  1, -1, -1 },  /* coord.xy */
		.src_wrmask = TGSI_WRITEMASK_XY,
		.flags = 0,
	};
	static const struct tex_info tex2ds = {
		.order = { 0,  1,  2, -1 },  /* coord.xyz */
		.src_wrmask = TGSI_WRITEMASK_XYZ,
		.flags = IR3_INSTR_S,
	};
	static const struct tex_info tex3d = {
		.order = { 0,  1,  2, -1 },  /* coord.xyz */
		.src_wrmask = TGSI_WRITEMASK_XYZ,
		.flags = IR3_INSTR_3D,
	};
	static const struct tex_info tex3ds = {
		.order = { 0,  1,  2,  3 },  /* coord.xyzw */
		.src_wrmask = TGSI_WRITEMASK_XYZW,
		.flags = IR3_INSTR_S | IR3_INSTR_3D,
	};
	static const struct tex_info txp1d = {
		.order = { 0, -1,  3, -1 },  /* coord.xw */
		.src_wrmask = TGSI_WRITEMASK_XYZ,
		.flags = IR3_INSTR_P,
	};
	static const struct tex_info txp1ds = {
		.order = { 0, -1,  2,  3 },  /* coord.xzw */
		.src_wrmask = TGSI_WRITEMASK_XYZW,
		.flags = IR3_INSTR_P | IR3_INSTR_S,
	};
	static const struct tex_info txp2d = {
		.order = { 0,  1,  3, -1 },  /* coord.xyw */
		.src_wrmask = TGSI_WRITEMASK_XYZ,
		.flags = IR3_INSTR_P,
	};
	static const struct tex_info txp2ds = {
		.order = { 0,  1,  2,  3 },  /* coord.xyzw */
		.src_wrmask = TGSI_WRITEMASK_XYZW,
		.flags = IR3_INSTR_P | IR3_INSTR_S,
	};
	static const struct tex_info txp3d = {
		.order = { 0,  1,  2,  3 },  /* coord.xyzw */
		.src_wrmask = TGSI_WRITEMASK_XYZW,
		.flags = IR3_INSTR_P | IR3_INSTR_3D,
	};

	unsigned tex = inst->Texture.Texture;

	switch (inst->Instruction.Opcode) {
	/* TXB uses the same coord layout as TEX (bias is handled as an
	 * extra src in trans_samp()):
	 */
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXB:
		switch (tex) {
		case TGSI_TEXTURE_1D:
			return &tex1d;
		case TGSI_TEXTURE_SHADOW1D:
			return &tex1ds;
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_RECT:
			return &tex2d;
		case TGSI_TEXTURE_SHADOW2D:
		case TGSI_TEXTURE_SHADOWRECT:
			return &tex2ds;
		case TGSI_TEXTURE_3D:
		case TGSI_TEXTURE_CUBE:
			return &tex3d;
		case TGSI_TEXTURE_SHADOWCUBE:
			return &tex3ds;
		default:
			compile_error(ctx, "unknown texture type: %s\n",
					tgsi_texture_names[tex]);
			return NULL;
		}
		break;
	case TGSI_OPCODE_TXP:
		switch (tex) {
		case TGSI_TEXTURE_1D:
			return &txp1d;
		case TGSI_TEXTURE_SHADOW1D:
			return &txp1ds;
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_RECT:
			return &txp2d;
		case TGSI_TEXTURE_SHADOW2D:
		case TGSI_TEXTURE_SHADOWRECT:
			return &txp2ds;
		case TGSI_TEXTURE_3D:
		case TGSI_TEXTURE_CUBE:
			return &txp3d;
		default:
			compile_error(ctx, "unknown texture type: %s\n",
					tgsi_texture_names[tex]);
			break;
		}
		break;
	}
	/* only TEX/TXB/TXP are routed here by the translaters table: */
	compile_assert(ctx, 0);
	return NULL;
}
1181
1182static bool check_swiz(struct tgsi_src_register *src, const int8_t order[4])
1183{
1184	unsigned i;
1185	for (i = 1; (i < 4) && order[i] >= 0; i++)
1186		if (src_swiz(src, i) != (src_swiz(src, 0) + order[i]))
1187			return false;
1188	return true;
1189}
1190
/* Produce a coord src suitable for a cat5 sample instruction,
 * inserting mov's into a temp when the incoming swizzle / register
 * file can't be consumed directly.
 */
static struct tgsi_src_register *
get_tex_coord(struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		const struct tex_info *tinf)
{
	struct tgsi_src_register *coord = &inst->Src[0].Register;
	struct ir3_instruction *instr;
	unsigned tex = inst->Texture.Texture;
	bool needs_mov = false;

	/* cat5 instruction cannot seem to handle const or relative: */
	if (is_rel_or_const(coord))
		needs_mov = true;

	/* 1D textures we fix up w/ 0.5 as 2nd coord: */
	if ((tex == TGSI_TEXTURE_1D) || (tex == TGSI_TEXTURE_SHADOW1D))
		needs_mov = true;

	/* The texture sample instructions need to coord in successive
	 * registers/components (ie. src.xy but not src.yx).  And TXP
	 * needs the .w component in .z for 2D..  so in some cases we
	 * might need to emit some mov instructions to shuffle things
	 * around:
	 */
	if (!needs_mov)
		needs_mov = !check_swiz(coord, tinf->order);

	if (needs_mov) {
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;
		unsigned j;

		type_t type_mov = get_ftype(ctx);

		/* need to move things around: */
		tmp_src = get_internal_temp(ctx, &tmp_dst);

		/* copy each used coord slot into the temp, in the
		 * layout the hw expects (tinf->order):
		 */
		for (j = 0; j < 4; j++) {
			if (tinf->order[j] < 0)
				continue;
			instr = instr_create(ctx, 1, 0);  /* mov */
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, j);
			add_src_reg(ctx, instr, coord,
					src_swiz(coord, tinf->order[j]));
		}

		/* fix up .y coord (write immediate 0.5): */
		if ((tex == TGSI_TEXTURE_1D) ||
				(tex == TGSI_TEXTURE_SHADOW1D)) {
			instr = instr_create(ctx, 1, 0);  /* mov */
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, 1);  /* .y */
			ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = 0.5;
		}

		coord = tmp_src;
	}

	return coord;
}
1254
/* Translate TEX/TXB/TXP into a cat5 sample instruction. */
static void
trans_samp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *coord;
	struct tgsi_src_register *samp  = &inst->Src[1].Register;
	const struct tex_info *tinf;

	tinf = get_tex_info(ctx, inst);
	/* may emit mov's to shuffle coord into consecutive components: */
	coord = get_tex_coord(ctx, inst, tinf);

	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	/* the same TGSI sampler index is used for both the sampler
	 * and texture state slots:
	 */
	instr->cat5.samp = samp->Index;
	instr->cat5.tex  = samp->Index;
	instr->flags |= tinf->flags;

	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
	add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleX, tinf->src_wrmask);

	/* TXB: bias is an extra src taken from coord .w.
	 * NOTE(review): if get_tex_coord() substituted a temp, only
	 * the slots in tinf->order were written to it -- verify the
	 * bias is read from the intended component in that case.
	 */
	if (t->tgsi_opc == TGSI_OPCODE_TXB)
		add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleW, 0x1);
}
1281
/* DDX/DDY -- screen-space partial derivatives (dsx/dsy) */
static void
trans_deriv(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	static const int8_t order[4] = {0, 1, 2, 3};

	/* src components must be in identity order; otherwise copy
	 * through a temp first:
	 */
	if (!check_swiz(src, order)) {
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;

		tmp_src = get_internal_temp(ctx, &tmp_dst);
		create_mov(ctx, &tmp_dst, src);

		src = tmp_src;
	}

	/* This might be a workaround for hw bug?  Blob compiler always
	 * seems to work two components at a time for dsy/dsx.  It does
	 * actually seem to work in some cases (or at least some piglit
	 * tests) for four components at a time.  But seems more reliable
	 * to split this into two instructions like the blob compiler
	 * does:
	 */

	/* first instruction covers .xy: */
	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask & 0x3);
	add_src_reg_wrmask(ctx, instr, src, 0, dst->WriteMask & 0x3);

	/* second instruction covers .zw: */
	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	add_dst_reg_wrmask(ctx, instr, dst, 2, (dst->WriteMask >> 2) & 0x3);
	add_src_reg_wrmask(ctx, instr, src, 2, (dst->WriteMask >> 2) & 0x3);
}
1321
1322/*
1323 * SEQ(a,b) = (a == b) ? 1.0 : 0.0
1324 *   cmps.f.eq tmp0, a, b
1325 *   cov.u16f16 dst, tmp0
1326 *
1327 * SNE(a,b) = (a != b) ? 1.0 : 0.0
1328 *   cmps.f.ne tmp0, a, b
1329 *   cov.u16f16 dst, tmp0
1330 *
1331 * SGE(a,b) = (a >= b) ? 1.0 : 0.0
1332 *   cmps.f.ge tmp0, a, b
1333 *   cov.u16f16 dst, tmp0
1334 *
1335 * SLE(a,b) = (a <= b) ? 1.0 : 0.0
1336 *   cmps.f.le tmp0, a, b
1337 *   cov.u16f16 dst, tmp0
1338 *
1339 * SGT(a,b) = (a > b)  ? 1.0 : 0.0
1340 *   cmps.f.gt tmp0, a, b
1341 *   cov.u16f16 dst, tmp0
1342 *
1343 * SLT(a,b) = (a < b)  ? 1.0 : 0.0
1344 *   cmps.f.lt tmp0, a, b
1345 *   cov.u16f16 dst, tmp0
1346 *
1347 * CMP(a,b,c) = (a < 0.0) ? b : c
1348 *   cmps.f.lt tmp0, a, {0.0}
1349 *   sel.b16 dst, b, tmp0, c
1350 */
/* Translate the float set-on-condition opcodes (SEQ/SNE/SGE/SLE/
 * SGT/SLT and the F* variants) plus CMP, per the expansions in the
 * comment above: cmps.f into a temp, followed by either a cov
 * (converting the bool result to 1.0/0.0) or, for CMP, a sel.
 */
static void
trans_cmp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register constval0;
	/* final instruction for CMP() uses orig src1 and src2: */
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a0, *a1, *a2;
	unsigned condition;

	tmp_src = get_internal_temp(ctx, &tmp_dst);

	a0 = &inst->Src[0].Register;  /* a */
	a1 = &inst->Src[1].Register;  /* b */

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_FSEQ:
		condition = IR3_COND_EQ;
		break;
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_FSNE:
		condition = IR3_COND_NE;
		break;
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_FSGE:
		condition = IR3_COND_GE;
		break;
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_FSLT:
		condition = IR3_COND_LT;
		break;
	case TGSI_OPCODE_SLE:
		condition = IR3_COND_LE;
		break;
	case TGSI_OPCODE_SGT:
		condition = IR3_COND_GT;
		break;
	case TGSI_OPCODE_CMP:
		/* CMP compares a against an immediate 0.0 instead of b: */
		get_immediate(ctx, &constval0, fui(0.0));
		a0 = &inst->Src[0].Register;  /* a */
		a1 = &constval0;              /* {0.0} */
		condition = IR3_COND_LT;
		break;
	default:
		compile_assert(ctx, 0);
		return;
	}

	/* cat2 can't take two const srcs; move one to a temp: */
	if (is_const(a0) && is_const(a1))
		a0 = get_unconst(ctx, a0);

	/* cmps.f.<cond> tmp, a0, a1 */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = condition;
	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_FSEQ:
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_FSGE:
	case TGSI_OPCODE_SLE:
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_FSNE:
	case TGSI_OPCODE_SGT:
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_FSLT:
		/* cov.u16f16 dst, tmp0 */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = get_utype(ctx);
		instr->cat1.dst_type = get_ftype(ctx);
		vectorize(ctx, instr, dst, 1, tmp_src, 0);
		break;
	case TGSI_OPCODE_CMP:
		a1 = &inst->Src[1].Register;
		a2 = &inst->Src[2].Register;
		/* sel.{b32,b16} dst, src2, tmp, src1 */
		instr = instr_create(ctx, 3, OPC_SEL_B32);
		vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);

		break;
	}

	put_dst(ctx, inst, dst);
}
1441
1442/*
1443 * USNE(a,b) = (a != b) ? 1 : 0
1444 *   cmps.u32.ne dst, a, b
1445 *
1446 * USEQ(a,b) = (a == b) ? 1 : 0
1447 *   cmps.u32.eq dst, a, b
1448 *
 * ISGE(a,b) = (a >= b) ? 1 : 0
1450 *   cmps.s32.ge dst, a, b
1451 *
 * USGE(a,b) = (a >= b) ? 1 : 0
1453 *   cmps.u32.ge dst, a, b
1454 *
1455 * ISLT(a,b) = (a < b) ? 1 : 0
1456 *   cmps.s32.lt dst, a, b
1457 *
1458 * USLT(a,b) = (a < b) ? 1 : 0
1459 *   cmps.u32.lt dst, a, b
1460 *
1461 * UCMP(a,b,c) = (a < 0) ? b : c
1462 *   cmps.u32.lt tmp0, a, {0}
1463 *   sel.b16 dst, b, tmp0, c
1464 */
/* Translate the integer set-on-condition opcodes (USNE/USEQ/ISGE/
 * USGE/ISLT/USLT) plus UCMP, per the expansions in the comment
 * above.  Unlike trans_cmp, the cmps result is the final 1/0 value
 * so no cov is needed; UCMP adds a sel over the temp.
 */
static void
trans_icmp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register constval0;
	struct tgsi_src_register *a0, *a1, *a2;
	unsigned condition;

	a0 = &inst->Src[0].Register;  /* a */
	a1 = &inst->Src[1].Register;  /* b */

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_USNE:
		condition = IR3_COND_NE;
		break;
	case TGSI_OPCODE_USEQ:
		condition = IR3_COND_EQ;
		break;
	case TGSI_OPCODE_ISGE:
	case TGSI_OPCODE_USGE:
		condition = IR3_COND_GE;
		break;
	case TGSI_OPCODE_ISLT:
	case TGSI_OPCODE_USLT:
		condition = IR3_COND_LT;
		break;
	case TGSI_OPCODE_UCMP:
		/* UCMP compares a against an immediate 0 instead of b: */
		get_immediate(ctx, &constval0, 0);
		a0 = &inst->Src[0].Register;  /* a */
		a1 = &constval0;              /* {0} */
		condition = IR3_COND_LT;
		break;

	default:
		compile_assert(ctx, 0);
		return;
	}

	/* cat2 can't take two const srcs; move one to a temp: */
	if (is_const(a0) && is_const(a1))
		a0 = get_unconst(ctx, a0);

	if (t->tgsi_opc == TGSI_OPCODE_UCMP) {
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;
		tmp_src = get_internal_temp(ctx, &tmp_dst);
		/* cmps.u32.lt tmp, a0, a1 */
		instr = instr_create(ctx, 2, t->opc);
		instr->cat2.condition = condition;
		vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);

		a1 = &inst->Src[1].Register;
		a2 = &inst->Src[2].Register;
		/* sel.{b32,b16} dst, src2, tmp, src1 */
		instr = instr_create(ctx, 3, OPC_SEL_B32);
		vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
	} else {
		/* cmps.{u32,s32}.<cond> dst, a0, a1 */
		instr = instr_create(ctx, 2, t->opc);
		instr->cat2.condition = condition;
		vectorize(ctx, instr, dst, 2, a0, 0, a1, 0);
	}
	put_dst(ctx, inst, dst);
}
1531
1532/*
1533 * Conditional / Flow control
1534 */
1535
1536static void
1537push_branch(struct ir3_compile_context *ctx, bool inv,
1538		struct ir3_instruction *instr, struct ir3_instruction *cond)
1539{
1540	unsigned int idx = ctx->branch_count++;
1541	compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch));
1542	ctx->branch[idx].instr = instr;
1543	ctx->branch[idx].inv = inv;
1544	/* else side of branch has same condition: */
1545	if (!inv)
1546		ctx->branch[idx].cond = cond;
1547}
1548
1549static struct ir3_instruction *
1550pop_branch(struct ir3_compile_context *ctx)
1551{
1552	unsigned int idx = --ctx->branch_count;
1553	return ctx->branch[idx].instr;
1554}
1555
/* Translate IF/UIF: compare the condition src against 0.0 and open
 * a new block for the taken side of the branch.
 * NOTE(review): UIF is routed here too, but the compare emitted is
 * float (cmps.f) -- fine for 1.0/0.0 style conditions; verify for
 * arbitrary integer-bit-pattern conditions.
 */
static void
trans_if(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *cond;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register constval;

	get_immediate(ctx, &constval, fui(0.0));
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	/* force src out of const space, so regs[1] below is an SSA
	 * src whose producing instruction we can reference:
	 */
	if (is_const(src))
		src = get_unconst(ctx, src);

	/* cmps.f.ne tmp0, b, {0.0} */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	add_dst_reg(ctx, instr, &tmp_dst, 0);
	add_src_reg(ctx, instr, src, src->SwizzleX);
	add_src_reg(ctx, instr, &constval, constval.SwizzleX);
	instr->cat2.condition = IR3_COND_NE;

	/* remember the producer of the condition value, for re-use
	 * by an enclosed KILL (see trans_kill):
	 */
	compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
	cond = instr->regs[1]->instr;

	/* meta:flow tmp0 */
	instr = instr_create(ctx, -1, OPC_META_FLOW);
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);

	push_branch(ctx, false, instr, cond);
	instr->flow.if_block = push_block(ctx);
}
1591
/* Translate ELSE: close the if-side block and open the else-side
 * block on the same meta:flow instruction.
 */
static void
trans_else(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;

	pop_block(ctx);

	instr = pop_branch(ctx);

	/* top of branch stack must be the meta:flow from trans_if: */
	compile_assert(ctx, (instr->category == -1) &&
			(instr->opc == OPC_META_FLOW));

	/* re-push with inv=true so KILL inside the else-side knows
	 * the condition is inverted:
	 */
	push_branch(ctx, true, instr, NULL);
	instr->flow.else_block = push_block(ctx);
}
1609
1610static struct ir3_instruction *
1611find_temporary(struct ir3_block *block, unsigned n)
1612{
1613	if (block->parent && !block->temporaries[n])
1614		return find_temporary(block->parent, n);
1615	return block->temporaries[n];
1616}
1617
1618static struct ir3_instruction *
1619find_output(struct ir3_block *block, unsigned n)
1620{
1621	if (block->parent && !block->outputs[n])
1622		return find_output(block->parent, n);
1623	return block->outputs[n];
1624}
1625
/* Create a meta:phi selecting between 'a' (value from the if-side)
 * and 'b' (else-side / fallthrough), keyed off the flow instruction
 * 'cond'.  Either of a/b may be NULL (see comment below).
 */
static struct ir3_instruction *
create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond,
		struct ir3_instruction *a, struct ir3_instruction *b)
{
	struct ir3_instruction *phi;

	compile_assert(ctx, cond);

	/* Either side of the condition could be null..  which
	 * indicates a variable written on only one side of the
	 * branch.  Normally this should only be variables not
	 * used outside of that side of the branch.  So we could
	 * just 'return a ? a : b;' in that case.  But for better
	 * defined undefined behavior we just stick in imm{0.0}.
	 * In the common case of a value only used within the
	 * one side of the branch, the PHI instruction will not
	 * get scheduled
	 */
	if (!a)
		a = create_immed(ctx, 0.0);
	if (!b)
		b = create_immed(ctx, 0.0);

	/* srcs: condition, if-side value, else-side value */
	phi = instr_create(ctx, -1, OPC_META_PHI);
	ir3_reg_create(phi, 0, 0);  /* dummy dst */
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond;
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a;
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b;

	return phi;
}
1657
/* Translate ENDIF: close the current block, then stitch the if/else
 * sides back together by emitting PHI instructions for every
 * temporary and shader output written inside either side, and by
 * rewriting the blocks' output lists to the values the PHIs consume.
 */
static void
trans_endif(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct ir3_block *ifb, *elseb;
	struct ir3_instruction **ifout, **elseout;
	unsigned i, ifnout = 0, elsenout = 0;

	pop_block(ctx);

	instr = pop_branch(ctx);

	/* top of branch stack must be the meta:flow from trans_if: */
	compile_assert(ctx, (instr->category == -1) &&
			(instr->opc == OPC_META_FLOW));

	ifb = instr->flow.if_block;
	elseb = instr->flow.else_block;
	/* if there is no else block, the parent block is used for the
	 * branch-not-taken src of the PHI instructions:
	 */
	if (!elseb)
		elseb = ifb->parent;

	/* worst case sizes: */
	ifnout = ifb->ntemporaries + ifb->noutputs;
	elsenout = elseb->ntemporaries + elseb->noutputs;

	/* elseout is only allocated (and later used) when there is a
	 * real else block, ie. elseb != ifb->parent:
	 */
	ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
	if (elseb != ifb->parent)
		elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);

	/* reset; re-used below as running counts while populating: */
	ifnout = 0;
	elsenout = 0;

	/* generate PHI instructions for any temporaries written: */
	for (i = 0; i < ifb->ntemporaries; i++) {
		struct ir3_instruction *a = ifb->temporaries[i];
		struct ir3_instruction *b = elseb->temporaries[i];

		/* if temporary written in if-block, or if else block
		 * is present and temporary written in else-block:
		 */
		if (a || ((elseb != ifb->parent) && b)) {
			struct ir3_instruction *phi;

			/* if only written on one side, find the closest
			 * enclosing update on other side:
			 */
			if (!a)
				a = find_temporary(ifb, i);
			if (!b)
				b = find_temporary(elseb, i);

			/* route the value out of the if-block: */
			ifout[ifnout] = a;
			a = create_output(ifb, a, ifnout++);

			if (elseb != ifb->parent) {
				elseout[elsenout] = b;
				b = create_output(elseb, b, elsenout++);
			}

			phi = create_phi(ctx, instr, a, b);
			ctx->block->temporaries[i] = phi;
		}
	}

	compile_assert(ctx, ifb->noutputs == elseb->noutputs);

	/* .. and any outputs written: */
	for (i = 0; i < ifb->noutputs; i++) {
		struct ir3_instruction *a = ifb->outputs[i];
		struct ir3_instruction *b = elseb->outputs[i];

		/* if output written in if-block, or if else block
		 * is present and output written in else-block:
		 */
		if (a || ((elseb != ifb->parent) && b)) {
			struct ir3_instruction *phi;

			/* if only written on one side, find the closest
			 * enclosing update on other side:
			 */
			if (!a)
				a = find_output(ifb, i);
			if (!b)
				b = find_output(elseb, i);

			ifout[ifnout] = a;
			a = create_output(ifb, a, ifnout++);

			if (elseb != ifb->parent) {
				elseout[elsenout] = b;
				b = create_output(elseb, b, elsenout++);
			}

			phi = create_phi(ctx, instr, a, b);
			ctx->block->outputs[i] = phi;
		}
	}

	/* install the compacted output lists built above: */
	ifb->noutputs = ifnout;
	ifb->outputs = ifout;

	if (elseb != ifb->parent) {
		elseb->noutputs = elsenout;
		elseb->outputs = elseout;
	}

	// TODO maybe we want to compact block->inputs?
}
1770
1771/*
1772 * Kill
1773 */
1774
/* Translate unconditional KILL.  TGSI KILL has no predicate src, so
 * the condition of the innermost enclosing if/else (saved by
 * push_branch()) is used, or constant 'true' at top level.
 */
static void
trans_kill(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *immed, *cond = NULL;
	bool inv = false;

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_KILL:
		/* unconditional kill, use enclosing if condition: */
		if (ctx->branch_count > 0) {
			unsigned int idx = ctx->branch_count - 1;
			cond = ctx->branch[idx].cond;
			/* inside an else-block, the condition is inverted: */
			inv = ctx->branch[idx].inv;
		} else {
			cond = create_immed(ctx, 1.0);
		}

		break;
	}

	compile_assert(ctx, cond);

	immed = create_immed(ctx, 0.0);

	/* cmps.f.ne p0.x, cond, {0.0} */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = IR3_COND_NE;
	ir3_reg_create(instr, regid(REG_P0, 0), 0);
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
	cond = instr;

	/* kill p0.x */
	instr = instr_create(ctx, 0, OPC_KILL);
	instr->cat0.inv = inv;
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;

	/* track kills so later passes can see them: */
	ctx->kill[ctx->kill_count++] = instr;
}
1817
1818/*
1819 * Kill-If
1820 */
1821
/* Translate KILL_IF: kill the fragment when src.x != 0.0. */
static void
trans_killif(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_src_register *src = &inst->Src[0].Register;
	struct ir3_instruction *instr, *immed, *cond = NULL;
	/* condition is explicit here, so never inverted: */
	bool inv = false;

	immed = create_immed(ctx, 0.0);

	/* cmps.f.ne p0.x, cond, {0.0} (NE is commutative, so the
	 * operand order vs trans_kill does not matter):
	 */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = IR3_COND_NE;
	ir3_reg_create(instr, regid(REG_P0, 0), 0);
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
	add_src_reg(ctx, instr, src, src->SwizzleX);

	cond = instr;

	/* kill p0.x */
	instr = instr_create(ctx, 0, OPC_KILL);
	instr->cat0.inv = inv;
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;

	/* track kills so later passes can see them: */
	ctx->kill[ctx->kill_count++] = instr;

}
1851/*
1852 * I2F / U2F / F2I / F2U
1853 */
1854
1855static void
1856trans_cov(const struct instr_translater *t,
1857		struct ir3_compile_context *ctx,
1858		struct tgsi_full_instruction *inst)
1859{
1860	struct ir3_instruction *instr;
1861	struct tgsi_dst_register *dst = get_dst(ctx, inst);
1862	struct tgsi_src_register *src = &inst->Src[0].Register;
1863
1864	// cov.f32s32 dst, tmp0 /
1865	instr = instr_create(ctx, 1, 0);
1866	switch (t->tgsi_opc) {
1867	case TGSI_OPCODE_U2F:
1868		instr->cat1.src_type = TYPE_U32;
1869		instr->cat1.dst_type = TYPE_F32;
1870		break;
1871	case TGSI_OPCODE_I2F:
1872		instr->cat1.src_type = TYPE_S32;
1873		instr->cat1.dst_type = TYPE_F32;
1874		break;
1875	case TGSI_OPCODE_F2U:
1876		instr->cat1.src_type = TYPE_F32;
1877		instr->cat1.dst_type = TYPE_U32;
1878		break;
1879	case TGSI_OPCODE_F2I:
1880		instr->cat1.src_type = TYPE_F32;
1881		instr->cat1.dst_type = TYPE_S32;
1882		break;
1883
1884	}
1885	vectorize(ctx, instr, dst, 1, src, 0);
1886}
1887
1888/*
1889 * Handlers for TGSI instructions which do have 1:1 mapping to native
1890 * instructions:
1891 */
1892
1893static void
1894instr_cat0(const struct instr_translater *t,
1895		struct ir3_compile_context *ctx,
1896		struct tgsi_full_instruction *inst)
1897{
1898	instr_create(ctx, 0, t->opc);
1899}
1900
1901static void
1902instr_cat1(const struct instr_translater *t,
1903		struct ir3_compile_context *ctx,
1904		struct tgsi_full_instruction *inst)
1905{
1906	struct tgsi_dst_register *dst = get_dst(ctx, inst);
1907	struct tgsi_src_register *src = &inst->Src[0].Register;
1908	create_mov(ctx, dst, src);
1909	put_dst(ctx, inst, dst);
1910}
1911
1912static void
1913instr_cat2(const struct instr_translater *t,
1914		struct ir3_compile_context *ctx,
1915		struct tgsi_full_instruction *inst)
1916{
1917	struct tgsi_dst_register *dst = get_dst(ctx, inst);
1918	struct tgsi_src_register *src0 = &inst->Src[0].Register;
1919	struct tgsi_src_register *src1 = &inst->Src[1].Register;
1920	struct ir3_instruction *instr;
1921	unsigned src0_flags = 0, src1_flags = 0;
1922
1923	switch (t->tgsi_opc) {
1924	case TGSI_OPCODE_ABS:
1925	case TGSI_OPCODE_IABS:
1926		src0_flags = IR3_REG_ABS;
1927		break;
1928	case TGSI_OPCODE_SUB:
1929	case TGSI_OPCODE_INEG:
1930		src1_flags = IR3_REG_NEGATE;
1931		break;
1932	}
1933
1934	switch (t->opc) {
1935	case OPC_ABSNEG_F:
1936	case OPC_ABSNEG_S:
1937	case OPC_CLZ_B:
1938	case OPC_CLZ_S:
1939	case OPC_SIGN_F:
1940	case OPC_FLOOR_F:
1941	case OPC_CEIL_F:
1942	case OPC_RNDNE_F:
1943	case OPC_RNDAZ_F:
1944	case OPC_TRUNC_F:
1945	case OPC_NOT_B:
1946	case OPC_BFREV_B:
1947	case OPC_SETRM:
1948	case OPC_CBITS_B:
1949		/* these only have one src reg */
1950		instr = instr_create(ctx, 2, t->opc);
1951		vectorize(ctx, instr, dst, 1, src0, src0_flags);
1952		break;
1953	default:
1954		if (is_const(src0) && is_const(src1))
1955			src0 = get_unconst(ctx, src0);
1956
1957		instr = instr_create(ctx, 2, t->opc);
1958		vectorize(ctx, instr, dst, 2, src0, src0_flags,
1959				src1, src1_flags);
1960		break;
1961	}
1962
1963	put_dst(ctx, inst, dst);
1964}
1965
1966static void
1967instr_cat3(const struct instr_translater *t,
1968		struct ir3_compile_context *ctx,
1969		struct tgsi_full_instruction *inst)
1970{
1971	struct tgsi_dst_register *dst = get_dst(ctx, inst);
1972	struct tgsi_src_register *src0 = &inst->Src[0].Register;
1973	struct tgsi_src_register *src1 = &inst->Src[1].Register;
1974	struct ir3_instruction *instr;
1975
1976	/* in particular, can't handle const for src1 for cat3..
1977	 * for mad, we can swap first two src's if needed:
1978	 */
1979	if (is_rel_or_const(src1)) {
1980		if (is_mad(t->opc) && !is_rel_or_const(src0)) {
1981			struct tgsi_src_register *tmp;
1982			tmp = src0;
1983			src0 = src1;
1984			src1 = tmp;
1985		} else {
1986			src1 = get_unconst(ctx, src1);
1987		}
1988	}
1989
1990	instr = instr_create(ctx, 3, t->opc);
1991	vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
1992			&inst->Src[2].Register, 0);
1993	put_dst(ctx, inst, dst);
1994}
1995
1996static void
1997instr_cat4(const struct instr_translater *t,
1998		struct ir3_compile_context *ctx,
1999		struct tgsi_full_instruction *inst)
2000{
2001	struct tgsi_dst_register *dst = get_dst(ctx, inst);
2002	struct tgsi_src_register *src = &inst->Src[0].Register;
2003	struct ir3_instruction *instr;
2004	unsigned i;
2005
2006	/* seems like blob compiler avoids const as src.. */
2007	if (is_const(src))
2008		src = get_unconst(ctx, src);
2009
2010	/* we need to replicate into each component: */
2011	for (i = 0; i < 4; i++) {
2012		if (dst->WriteMask & (1 << i)) {
2013			instr = instr_create(ctx, 4, t->opc);
2014			add_dst_reg(ctx, instr, dst, i);
2015			add_src_reg(ctx, instr, src, src->SwizzleX);
2016		}
2017	}
2018
2019	put_dst(ctx, inst, dst);
2020}
2021
2022static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
2023#define INSTR(n, f, ...) \
2024	[TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
2025
2026	INSTR(MOV,          instr_cat1),
2027	INSTR(RCP,          instr_cat4, .opc = OPC_RCP),
2028	INSTR(RSQ,          instr_cat4, .opc = OPC_RSQ),
2029	INSTR(SQRT,         instr_cat4, .opc = OPC_SQRT),
2030	INSTR(MUL,          instr_cat2, .opc = OPC_MUL_F),
2031	INSTR(ADD,          instr_cat2, .opc = OPC_ADD_F),
2032	INSTR(SUB,          instr_cat2, .opc = OPC_ADD_F),
2033	INSTR(MIN,          instr_cat2, .opc = OPC_MIN_F),
2034	INSTR(MAX,          instr_cat2, .opc = OPC_MAX_F),
2035	INSTR(UADD,         instr_cat2, .opc = OPC_ADD_U),
2036	INSTR(IMIN,         instr_cat2, .opc = OPC_MIN_S),
2037	INSTR(UMIN,         instr_cat2, .opc = OPC_MIN_U),
2038	INSTR(IMAX,         instr_cat2, .opc = OPC_MAX_S),
2039	INSTR(UMAX,         instr_cat2, .opc = OPC_MAX_U),
2040	INSTR(AND,          instr_cat2, .opc = OPC_AND_B),
2041	INSTR(OR,           instr_cat2, .opc = OPC_OR_B),
2042	INSTR(NOT,          instr_cat2, .opc = OPC_NOT_B),
2043	INSTR(XOR,          instr_cat2, .opc = OPC_XOR_B),
2044	INSTR(UMUL,         instr_cat2, .opc = OPC_MUL_U),
2045	INSTR(SHL,          instr_cat2, .opc = OPC_SHL_B),
2046	INSTR(USHR,         instr_cat2, .opc = OPC_SHR_B),
2047	INSTR(ISHR,         instr_cat2, .opc = OPC_ASHR_B),
2048	INSTR(IABS,         instr_cat2, .opc = OPC_ABSNEG_S),
2049	INSTR(INEG,         instr_cat2, .opc = OPC_ABSNEG_S),
2050	INSTR(AND,          instr_cat2, .opc = OPC_AND_B),
2051	INSTR(MAD,          instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
2052	INSTR(TRUNC,        instr_cat2, .opc = OPC_TRUNC_F),
2053	INSTR(CLAMP,        trans_clamp),
2054	INSTR(FLR,          instr_cat2, .opc = OPC_FLOOR_F),
2055	INSTR(ROUND,        instr_cat2, .opc = OPC_RNDNE_F),
2056	INSTR(SSG,          instr_cat2, .opc = OPC_SIGN_F),
2057	INSTR(CEIL,         instr_cat2, .opc = OPC_CEIL_F),
2058	INSTR(ARL,          trans_arl),
2059	INSTR(EX2,          instr_cat4, .opc = OPC_EXP2),
2060	INSTR(LG2,          instr_cat4, .opc = OPC_LOG2),
2061	INSTR(ABS,          instr_cat2, .opc = OPC_ABSNEG_F),
2062	INSTR(COS,          instr_cat4, .opc = OPC_COS),
2063	INSTR(SIN,          instr_cat4, .opc = OPC_SIN),
2064	INSTR(TEX,          trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX),
2065	INSTR(TXP,          trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP),
2066	INSTR(TXB,          trans_samp, .opc = OPC_SAMB, .arg = TGSI_OPCODE_TXB),
2067	INSTR(DDX,          trans_deriv, .opc = OPC_DSX),
2068	INSTR(DDY,          trans_deriv, .opc = OPC_DSY),
2069	INSTR(SGT,          trans_cmp),
2070	INSTR(SLT,          trans_cmp),
2071	INSTR(FSLT,         trans_cmp),
2072	INSTR(SGE,          trans_cmp),
2073	INSTR(FSGE,         trans_cmp),
2074	INSTR(SLE,          trans_cmp),
2075	INSTR(SNE,          trans_cmp),
2076	INSTR(FSNE,         trans_cmp),
2077	INSTR(SEQ,          trans_cmp),
2078	INSTR(FSEQ,         trans_cmp),
2079	INSTR(CMP,          trans_cmp),
2080	INSTR(USNE,         trans_icmp, .opc = OPC_CMPS_U),
2081	INSTR(USEQ,         trans_icmp, .opc = OPC_CMPS_U),
2082	INSTR(ISGE,         trans_icmp, .opc = OPC_CMPS_S),
2083	INSTR(USGE,         trans_icmp, .opc = OPC_CMPS_U),
2084	INSTR(ISLT,         trans_icmp, .opc = OPC_CMPS_S),
2085	INSTR(USLT,         trans_icmp, .opc = OPC_CMPS_U),
2086	INSTR(UCMP,         trans_icmp, .opc = OPC_CMPS_U),
2087	INSTR(IF,           trans_if),
2088	INSTR(UIF,          trans_if),
2089	INSTR(ELSE,         trans_else),
2090	INSTR(ENDIF,        trans_endif),
2091	INSTR(END,          instr_cat0, .opc = OPC_END),
2092	INSTR(KILL,         trans_kill, .opc = OPC_KILL),
2093	INSTR(KILL_IF,      trans_killif, .opc = OPC_KILL),
2094	INSTR(I2F,          trans_cov),
2095	INSTR(U2F,          trans_cov),
2096	INSTR(F2I,          trans_cov),
2097	INSTR(F2U,          trans_cov),
2098};
2099
2100static ir3_semantic
2101decl_semantic(const struct tgsi_declaration_semantic *sem)
2102{
2103	return ir3_semantic_name(sem->Name, sem->Index);
2104}
2105
2106static struct ir3_instruction *
2107decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid,
2108		unsigned j, unsigned inloc)
2109{
2110	struct ir3_instruction *instr;
2111	struct ir3_register *src;
2112
2113	/* bary.f dst, #inloc, r0.x */
2114	instr = instr_create(ctx, 2, OPC_BARY_F);
2115	ir3_reg_create(instr, regid, 0);   /* dummy dst */
2116	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
2117	src = ir3_reg_create(instr, 0, IR3_REG_SSA);
2118	src->wrmask = 0x3;
2119	src->instr = ctx->frag_pos;
2120
2121	return instr;
2122}
2123
2124/* TGSI_SEMANTIC_POSITION
2125 * """"""""""""""""""""""
2126 *
2127 * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
2128 * fragment shader input contains the fragment's window position.  The X
2129 * component starts at zero and always increases from left to right.
2130 * The Y component starts at zero and always increases but Y=0 may either
2131 * indicate the top of the window or the bottom depending on the fragment
2132 * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
2133 * The Z coordinate ranges from 0 to 1 to represent depth from the front
 * to the back of the Z buffer.  The W component contains the reciprocal
2135 * of the interpolated vertex position W component.
2136 */
2137static struct ir3_instruction *
2138decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
2139		unsigned j)
2140{
2141	struct ir3_instruction *instr, *src;
2142
2143	compile_assert(ctx, !ctx->frag_coord[j]);
2144
2145	ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);
2146
2147
2148	switch (j) {
2149	case 0: /* .x */
2150	case 1: /* .y */
2151		/* for frag_coord, we get unsigned values.. we need
2152		 * to subtract (integer) 8 and divide by 16 (right-
2153		 * shift by 4) then convert to float:
2154		 */
2155
2156		/* add.s tmp, src, -8 */
2157		instr = instr_create(ctx, 2, OPC_ADD_S);
2158		ir3_reg_create(instr, regid, 0);    /* dummy dst */
2159		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
2160		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;
2161		src = instr;
2162
2163		/* shr.b tmp, tmp, 4 */
2164		instr = instr_create(ctx, 2, OPC_SHR_B);
2165		ir3_reg_create(instr, regid, 0);    /* dummy dst */
2166		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
2167		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
2168		src = instr;
2169
2170		/* mov.u32f32 dst, tmp */
2171		instr = instr_create(ctx, 1, 0);
2172		instr->cat1.src_type = TYPE_U32;
2173		instr->cat1.dst_type = TYPE_F32;
2174		ir3_reg_create(instr, regid, 0);    /* dummy dst */
2175		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
2176
2177		break;
2178	case 2: /* .z */
2179	case 3: /* .w */
2180		/* seems that we can use these as-is: */
2181		instr = ctx->frag_coord[j];
2182		break;
2183	default:
2184		compile_error(ctx, "invalid channel\n");
2185		instr = create_immed(ctx, 0.0);
2186		break;
2187	}
2188
2189	return instr;
2190}
2191
2192/* TGSI_SEMANTIC_FACE
2193 * """"""""""""""""""
2194 *
2195 * This label applies to fragment shader inputs only and indicates that
2196 * the register contains front/back-face information of the form (F, 0,
2197 * 0, 1).  The first component will be positive when the fragment belongs
2198 * to a front-facing polygon, and negative when the fragment belongs to a
2199 * back-facing polygon.
2200 */
2201static struct ir3_instruction *
2202decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
2203		unsigned j)
2204{
2205	struct ir3_instruction *instr, *src;
2206
2207	switch (j) {
2208	case 0: /* .x */
2209		compile_assert(ctx, !ctx->frag_face);
2210
2211		ctx->frag_face = create_input(ctx->block, NULL, 0);
2212
2213		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
2214		 * positive vs negative float.. and piglit further seems to
2215		 * expect -1.0 or 1.0:
2216		 *
2217		 *    mul.s tmp, hr0.x, 2
2218		 *    add.s tmp, tmp, 1
2219		 *    mov.s16f32, dst, tmp
2220		 *
2221		 */
2222
2223		instr = instr_create(ctx, 2, OPC_MUL_S);
2224		ir3_reg_create(instr, regid, 0);    /* dummy dst */
2225		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face;
2226		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
2227		src = instr;
2228
2229		instr = instr_create(ctx, 2, OPC_ADD_S);
2230		ir3_reg_create(instr, regid, 0);    /* dummy dst */
2231		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
2232		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
2233		src = instr;
2234
2235		instr = instr_create(ctx, 1, 0); /* mov */
2236		instr->cat1.src_type = TYPE_S32;
2237		instr->cat1.dst_type = TYPE_F32;
2238		ir3_reg_create(instr, regid, 0);    /* dummy dst */
2239		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
2240
2241		break;
2242	case 1: /* .y */
2243	case 2: /* .z */
2244		instr = create_immed(ctx, 0.0);
2245		break;
2246	case 3: /* .w */
2247		instr = create_immed(ctx, 1.0);
2248		break;
2249	default:
2250		compile_error(ctx, "invalid channel\n");
2251		instr = create_immed(ctx, 0.0);
2252		break;
2253	}
2254
2255	return instr;
2256}
2257
/* handle a TGSI input declaration: fill in the shader variant's input
 * table and create the per-component input instructions (bary.f fetch
 * or frag_coord/frag_face fixup for FS, plain inputs for VS):
 */
static void
decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned name = decl->Semantic.Name;
	unsigned i;

	/* I don't think we should get frag shader input without
	 * semantic info?  Otherwise how do inputs get linked to
	 * vert outputs?
	 */
	compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
			decl->Declaration.Semantic);

	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
		unsigned n = so->inputs_count++;   /* variant input slot */
		unsigned r = regid(i, 0);          /* base regid for this input */
		unsigned ncomp, j;

		/* we'll figure out the actual components used after scheduling */
		ncomp = 4;

		DBG("decl in -> r%d", i);

		compile_assert(ctx, n < ARRAY_SIZE(so->inputs));

		so->inputs[n].semantic = decl_semantic(&decl->Semantic);
		so->inputs[n].compmask = (1 << ncomp) - 1;
		so->inputs[n].regid = r;
		so->inputs[n].inloc = ctx->next_inloc;

		for (j = 0; j < ncomp; j++) {
			struct ir3_instruction *instr = NULL;

			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				/* for fragment shaders, POSITION and FACE are handled
				 * specially, not using normal varying / bary.f
				 */
				if (name == TGSI_SEMANTIC_POSITION) {
					so->inputs[n].bary = false;
					so->frag_coord = true;
					instr = decl_in_frag_coord(ctx, r + j, j);
				} else if (name == TGSI_SEMANTIC_FACE) {
					so->inputs[n].bary = false;
					so->frag_face = true;
					instr = decl_in_frag_face(ctx, r + j, j);
				} else {
					so->inputs[n].bary = true;
					/* NOTE(review): the '- 8' bias presumably matches
					 * the starting value of ctx->next_inloc set up in
					 * compile_init (not visible here) -- confirm.
					 */
					instr = decl_in_frag_bary(ctx, r + j, j,
							so->inputs[n].inloc + j - 8);
				}
			} else {
				instr = create_input(ctx->block, NULL, (i * 4) + j);
			}

			ctx->block->inputs[(i * 4) + j] = instr;
		}

		/* frag_coord/frag_face (bary == false) don't consume varying
		 * slots or count towards total_in:
		 */
		if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
			ctx->next_inloc += ncomp;
			so->total_in += ncomp;
		}
	}
}
2322
2323static void
2324decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
2325{
2326	struct ir3_shader_variant *so = ctx->so;
2327	unsigned comp = 0;
2328	unsigned name = decl->Semantic.Name;
2329	unsigned i;
2330
2331	compile_assert(ctx, decl->Declaration.Semantic);
2332
2333	DBG("decl out[%d] -> r%d", name, decl->Range.First);
2334
2335	if (ctx->type == TGSI_PROCESSOR_VERTEX) {
2336		switch (name) {
2337		case TGSI_SEMANTIC_POSITION:
2338			so->writes_pos = true;
2339			break;
2340		case TGSI_SEMANTIC_PSIZE:
2341			so->writes_psize = true;
2342			break;
2343		case TGSI_SEMANTIC_COLOR:
2344		case TGSI_SEMANTIC_BCOLOR:
2345		case TGSI_SEMANTIC_GENERIC:
2346		case TGSI_SEMANTIC_FOG:
2347		case TGSI_SEMANTIC_TEXCOORD:
2348			break;
2349		default:
2350			compile_error(ctx, "unknown VS semantic name: %s\n",
2351					tgsi_semantic_names[name]);
2352		}
2353	} else {
2354		switch (name) {
2355		case TGSI_SEMANTIC_POSITION:
2356			comp = 2;  /* tgsi will write to .z component */
2357			so->writes_pos = true;
2358			break;
2359		case TGSI_SEMANTIC_COLOR:
2360			break;
2361		default:
2362			compile_error(ctx, "unknown FS semantic name: %s\n",
2363					tgsi_semantic_names[name]);
2364		}
2365	}
2366
2367	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
2368		unsigned n = so->outputs_count++;
2369		unsigned ncomp, j;
2370
2371		ncomp = 4;
2372
2373		compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
2374
2375		so->outputs[n].semantic = decl_semantic(&decl->Semantic);
2376		so->outputs[n].regid = regid(i, comp);
2377
2378		/* avoid undefined outputs, stick a dummy mov from imm{0.0},
2379		 * which if the output is actually assigned will be over-
2380		 * written
2381		 */
2382		for (j = 0; j < ncomp; j++)
2383			ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0);
2384	}
2385}
2386
2387/* from TGSI perspective, we actually have inputs.  But most of the "inputs"
2388 * for a fragment shader are just bary.f instructions.  The *actual* inputs
2389 * from the hw perspective are the frag_pos and optionally frag_coord and
2390 * frag_face.
2391 */
static void
fixup_frag_inputs(struct ir3_compile_context *ctx)
{
	struct ir3_shader_variant *so = ctx->so;
	struct ir3_block *block = ctx->block;
	struct ir3_instruction **inputs;
	struct ir3_instruction *instr;
	int n, regid = 0;

	/* rebuild the block's input list from scratch, keeping only the
	 * real hw inputs (frag_face, frag_coord, frag_pos):
	 */
	block->ninputs = 0;

	/* worst-case input count, in vec4 granularity: */
	n  = 4;  /* always have frag_pos */
	n += COND(so->frag_face, 4);
	n += COND(so->frag_coord, 4);

	inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));

	if (so->frag_face) {
		/* this ultimately gets assigned to hr0.x so doesn't conflict
		 * with frag_coord/frag_pos..
		 */
		inputs[block->ninputs++] = ctx->frag_face;
		ctx->frag_face->regs[0]->num = 0;

		/* remaining channels not used, but let's avoid confusing
		 * other parts that expect inputs to come in groups of vec4
		 */
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;
	}

	/* since we don't know where to set the regid for frag_coord,
	 * we have to use r0.x for it.  But we don't want to *always*
	 * use r1.x for frag_pos as that could increase the register
	 * footprint on simple shaders:
	 */
	if (so->frag_coord) {
		/* frag_coord takes r0.xyzw, pushing frag_pos up to r1.xy: */
		ctx->frag_coord[0]->regs[0]->num = regid++;
		ctx->frag_coord[1]->regs[0]->num = regid++;
		ctx->frag_coord[2]->regs[0]->num = regid++;
		ctx->frag_coord[3]->regs[0]->num = regid++;

		inputs[block->ninputs++] = ctx->frag_coord[0];
		inputs[block->ninputs++] = ctx->frag_coord[1];
		inputs[block->ninputs++] = ctx->frag_coord[2];
		inputs[block->ninputs++] = ctx->frag_coord[3];
	}

	/* we always have frag_pos: */
	so->pos_regid = regid;

	/* r0.x -- wire the new input up as the first src of the frag_pos
	 * meta instruction:
	 */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[1]->instr = instr;

	/* r0.y -- likewise for the second src: */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[2]->instr = instr;

	block->inputs = inputs;
}
2458
/* main TGSI -> ir3 translation loop: walks the token stream and
 * dispatches declarations, immediates, and instructions to the
 * appropriate handlers / translaters.
 */
static void
compile_instructions(struct ir3_compile_context *ctx)
{
	push_block(ctx);

	/* for fragment shader, we have a single input register (usually
	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
	 */
	if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
		struct ir3_instruction *instr;
		instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
		ir3_reg_create(instr, 0, 0);
		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.x */
		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.y */
		ctx->frag_pos = instr;
	}

	while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
		tgsi_parse_token(&ctx->parser);

		switch (ctx->parser.FullToken.Token.Type) {
		case TGSI_TOKEN_TYPE_DECLARATION: {
			struct tgsi_full_declaration *decl =
					&ctx->parser.FullToken.FullDeclaration;
			if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
				decl_out(ctx, decl);
			} else if (decl->Declaration.File == TGSI_FILE_INPUT) {
				decl_in(ctx, decl);
			}
			break;
		}
		case TGSI_TOKEN_TYPE_IMMEDIATE: {
			/* TODO: if we know the immediate is small enough, and only
			 * used with instructions that can embed an immediate, we
			 * can skip this:
			 */
			struct tgsi_full_immediate *imm =
					&ctx->parser.FullToken.FullImmediate;
			unsigned n = ctx->so->immediates_count++;
			compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
			/* copy the vec4 (4 x 32b) immediate payload: */
			memcpy(ctx->so->immediates[n].val, imm->u, 16);
			break;
		}
		case TGSI_TOKEN_TYPE_INSTRUCTION: {
			struct tgsi_full_instruction *inst =
					&ctx->parser.FullToken.FullInstruction;
			unsigned opc = inst->Instruction.Opcode;
			const struct instr_translater *t = &translaters[opc];

			if (t->fxn) {
				t->fxn(t, ctx, inst);
				/* internal temps only live for a single TGSI
				 * instruction's translation:
				 */
				ctx->num_internal_temps = 0;
			} else {
				compile_error(ctx, "unknown TGSI opc: %s\n",
						tgsi_get_opcode_name(opc));
			}

			/* apply any requested saturation as a clamp on the dst: */
			switch (inst->Instruction.Saturate) {
			case TGSI_SAT_ZERO_ONE:
				create_clamp_imm(ctx, &inst->Dst[0].Register,
						fui(0.0), fui(1.0));
				break;
			case TGSI_SAT_MINUS_PLUS_ONE:
				create_clamp_imm(ctx, &inst->Dst[0].Register,
						fui(-1.0), fui(1.0));
				break;
			}

			instr_finish(ctx);

			break;
		}
		default:
			break;
		}
	}
}
2536
2537static void
2538compile_dump(struct ir3_compile_context *ctx)
2539{
2540	const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
2541	static unsigned n = 0;
2542	char fname[16];
2543	FILE *f;
2544	snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
2545	f = fopen(fname, "w");
2546	if (!f)
2547		return;
2548	ir3_block_depth(ctx->block);
2549	ir3_dump(ctx->ir, name, ctx->block, f);
2550	fclose(f);
2551}
2552
2553int
2554ir3_compile_shader(struct ir3_shader_variant *so,
2555		const struct tgsi_token *tokens, struct ir3_shader_key key,
2556		bool cp)
2557{
2558	struct ir3_compile_context ctx;
2559	struct ir3_block *block;
2560	struct ir3_instruction **inputs;
2561	unsigned i, j, actual_in;
2562	int ret = 0;
2563
2564	assert(!so->ir);
2565
2566	so->ir = ir3_create();
2567
2568	assert(so->ir);
2569
2570	if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
2571		DBG("INIT failed!");
2572		ret = -1;
2573		goto out;
2574	}
2575
2576	compile_instructions(&ctx);
2577
2578	block = ctx.block;
2579
2580	/* keep track of the inputs from TGSI perspective.. */
2581	inputs = block->inputs;
2582
2583	/* but fixup actual inputs for frag shader: */
2584	if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
2585		fixup_frag_inputs(&ctx);
2586
2587	/* at this point, for binning pass, throw away unneeded outputs: */
2588	if (key.binning_pass) {
2589		for (i = 0, j = 0; i < so->outputs_count; i++) {
2590			unsigned name = sem2name(so->outputs[i].semantic);
2591			unsigned idx = sem2name(so->outputs[i].semantic);
2592
2593			/* throw away everything but first position/psize */
2594			if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
2595					(name == TGSI_SEMANTIC_PSIZE))) {
2596				if (i != j) {
2597					so->outputs[j] = so->outputs[i];
2598					block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
2599					block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
2600					block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
2601					block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
2602				}
2603				j++;
2604			}
2605		}
2606		so->outputs_count = j;
2607		block->noutputs = j * 4;
2608	}
2609
2610	/* for rendering to alpha format, we only need the .w component,
2611	 * and we need it to be in the .x position:
2612	 */
2613	if (key.alpha) {
2614		for (i = 0, j = 0; i < so->outputs_count; i++) {
2615			unsigned name = sem2name(so->outputs[i].semantic);
2616
2617			/* move .w component to .x and discard others: */
2618			if (name == TGSI_SEMANTIC_COLOR) {
2619				block->outputs[(i*4)+0] = block->outputs[(i*4)+3];
2620				block->outputs[(i*4)+1] = NULL;
2621				block->outputs[(i*4)+2] = NULL;
2622				block->outputs[(i*4)+3] = NULL;
2623			}
2624		}
2625	}
2626
2627	/* at this point, we want the kill's in the outputs array too,
2628	 * so that they get scheduled (since they have no dst).. we've
2629	 * already ensured that the array is big enough in push_block():
2630	 */
2631	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
2632		for (i = 0; i < ctx.kill_count; i++)
2633			block->outputs[block->noutputs++] = ctx.kill[i];
2634	}
2635
2636	if (fd_mesa_debug & FD_DBG_OPTDUMP)
2637		compile_dump(&ctx);
2638
2639	ret = ir3_block_flatten(block);
2640	if (ret < 0) {
2641		DBG("FLATTEN failed!");
2642		goto out;
2643	}
2644	if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
2645		compile_dump(&ctx);
2646
2647	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
2648		printf("BEFORE CP:\n");
2649		ir3_dump_instr_list(block->head);
2650	}
2651
2652	if (cp)
2653		ir3_block_cp(block);
2654
2655	if (fd_mesa_debug & FD_DBG_OPTDUMP)
2656		compile_dump(&ctx);
2657
2658	ir3_block_depth(block);
2659
2660	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
2661		printf("AFTER DEPTH:\n");
2662		ir3_dump_instr_list(block->head);
2663	}
2664
2665	ret = ir3_block_sched(block);
2666	if (ret) {
2667		DBG("SCHED failed!");
2668		goto out;
2669	}
2670
2671	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
2672		printf("AFTER SCHED:\n");
2673		ir3_dump_instr_list(block->head);
2674	}
2675
2676	ret = ir3_block_ra(block, so->type, key.half_precision,
2677			so->frag_coord, so->frag_face, &so->has_samp);
2678	if (ret) {
2679		DBG("RA failed!");
2680		goto out;
2681	}
2682
2683	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
2684		printf("AFTER RA:\n");
2685		ir3_dump_instr_list(block->head);
2686	}
2687
2688	/* fixup input/outputs: */
2689	for (i = 0; i < so->outputs_count; i++) {
2690		so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
2691		/* preserve hack for depth output.. tgsi writes depth to .z,
2692		 * but what we give the hw is the scalar register:
2693		 */
2694		if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
2695			(sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
2696			so->outputs[i].regid += 2;
2697	}
2698	/* Note that some or all channels of an input may be unused: */
2699	actual_in = 0;
2700	for (i = 0; i < so->inputs_count; i++) {
2701		unsigned j, regid = ~0, compmask = 0;
2702		so->inputs[i].ncomp = 0;
2703		for (j = 0; j < 4; j++) {
2704			struct ir3_instruction *in = inputs[(i*4) + j];
2705			if (in) {
2706				compmask |= (1 << j);
2707				regid = in->regs[0]->num - j;
2708				actual_in++;
2709				so->inputs[i].ncomp++;
2710			}
2711		}
2712		so->inputs[i].regid = regid;
2713		so->inputs[i].compmask = compmask;
2714	}
2715
2716	/* fragment shader always gets full vec4's even if it doesn't
2717	 * fetch all components, but vertex shader we need to update
2718	 * with the actual number of components fetch, otherwise thing
2719	 * will hang due to mismaptch between VFD_DECODE's and
2720	 * TOTALATTRTOVS
2721	 */
2722	if (so->type == SHADER_VERTEX)
2723		so->total_in = actual_in;
2724
2725out:
2726	if (ret) {
2727		ir3_destroy(so->ir);
2728		so->ir = NULL;
2729	}
2730	compile_free(&ctx);
2731
2732	return ret;
2733}
2734